structure saas with tools
This commit is contained in:
69
.venv/lib/python3.10/site-packages/fsspec/__init__.py
Normal file
@@ -0,0 +1,69 @@
from importlib.metadata import entry_points

from . import caching
from ._version import __version__  # noqa: F401
from .callbacks import Callback
from .compression import available_compressions
from .core import get_fs_token_paths, open, open_files, open_local, url_to_fs
from .exceptions import FSTimeoutError
from .mapping import FSMap, get_mapper
from .registry import (
    available_protocols,
    filesystem,
    get_filesystem_class,
    register_implementation,
    registry,
)
from .spec import AbstractFileSystem

__all__ = [
    "AbstractFileSystem",
    "FSTimeoutError",
    "FSMap",
    "filesystem",
    "register_implementation",
    "get_filesystem_class",
    "get_fs_token_paths",
    "get_mapper",
    "open",
    "open_files",
    "open_local",
    "registry",
    "caching",
    "Callback",
    "available_protocols",
    "available_compressions",
    "url_to_fs",
]


def process_entries():
    if entry_points is not None:
        try:
            eps = entry_points()
        except TypeError:
            pass  # importlib-metadata < 0.8
        else:
            if hasattr(eps, "select"):  # Python 3.10+ / importlib_metadata >= 3.9.0
                specs = eps.select(group="fsspec.specs")
            else:
                specs = eps.get("fsspec.specs", [])
            registered_names = {}
            for spec in specs:
                err_msg = f"Unable to load filesystem from {spec}"
                name = spec.name
                if name in registered_names:
                    continue
                registered_names[name] = True
                register_implementation(
                    name,
                    spec.value.replace(":", "."),
                    errtxt=err_msg,
                    # We take our implementations as the ones to overload with if
                    # for some reason we encounter some, may be the same, already
                    # registered
                    clobber=True,
                )


process_entries()
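The module above auto-registers any filesystem advertised under the "fsspec.specs" entry-point group. A minimal sketch of the same registration done directly, assuming a hypothetical "myfs" protocol and MyFileSystem class (not part of the committed file):

import fsspec
from fsspec.spec import AbstractFileSystem


class MyFileSystem(AbstractFileSystem):
    # hypothetical protocol name used only for illustration
    protocol = "myfs"

    def ls(self, path, detail=True, **kwargs):
        # toy listing: pretend every directory is empty
        return []


# clobber=True mirrors what process_entries() passes for entry-point specs:
# a later registration with the same name wins
fsspec.register_implementation("myfs", MyFileSystem, clobber=True)

fs = fsspec.filesystem("myfs")
print(type(fs).__name__)  # -> MyFileSystem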
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
21
.venv/lib/python3.10/site-packages/fsspec/_version.py
Normal file
@@ -0,0 +1,21 @@
# file generated by setuptools-scm
# don't change, don't track in version control

__all__ = ["__version__", "__version_tuple__", "version", "version_tuple"]

TYPE_CHECKING = False
if TYPE_CHECKING:
    from typing import Tuple
    from typing import Union

    VERSION_TUPLE = Tuple[Union[int, str], ...]
else:
    VERSION_TUPLE = object

version: str
__version__: str
__version_tuple__: VERSION_TUPLE
version_tuple: VERSION_TUPLE

__version__ = version = '2025.3.2'
__version_tuple__ = version_tuple = (2025, 3, 2)
75
.venv/lib/python3.10/site-packages/fsspec/archive.py
Normal file
@@ -0,0 +1,75 @@
import operator

from fsspec import AbstractFileSystem
from fsspec.utils import tokenize


class AbstractArchiveFileSystem(AbstractFileSystem):
    """
    A generic superclass for implementing Archive-based filesystems.

    Currently, it is shared amongst
    :class:`~fsspec.implementations.zip.ZipFileSystem`,
    :class:`~fsspec.implementations.libarchive.LibArchiveFileSystem` and
    :class:`~fsspec.implementations.tar.TarFileSystem`.
    """

    def __str__(self):
        return f"<Archive-like object {type(self).__name__} at {id(self)}>"

    __repr__ = __str__

    def ukey(self, path):
        return tokenize(path, self.fo, self.protocol)

    def _all_dirnames(self, paths):
        """Returns *all* directory names for each path in paths, including intermediate
        ones.

        Parameters
        ----------
        paths: Iterable of path strings
        """
        if len(paths) == 0:
            return set()

        dirnames = {self._parent(path) for path in paths} - {self.root_marker}
        return dirnames | self._all_dirnames(dirnames)

    def info(self, path, **kwargs):
        self._get_dirs()
        path = self._strip_protocol(path)
        if path in {"", "/"} and self.dir_cache:
            return {"name": "", "type": "directory", "size": 0}
        if path in self.dir_cache:
            return self.dir_cache[path]
        elif path + "/" in self.dir_cache:
            return self.dir_cache[path + "/"]
        else:
            raise FileNotFoundError(path)

    def ls(self, path, detail=True, **kwargs):
        self._get_dirs()
        paths = {}
        for p, f in self.dir_cache.items():
            p = p.rstrip("/")
            if "/" in p:
                root = p.rsplit("/", 1)[0]
            else:
                root = ""
            if root == path.rstrip("/"):
                paths[p] = f
            elif all(
                (a == b)
                for a, b in zip(path.split("/"), [""] + p.strip("/").split("/"))
            ):
                # root directory entry
                ppath = p.rstrip("/").split("/", 1)[0]
                if ppath not in paths:
                    out = {"name": ppath, "size": 0, "type": "directory"}
                    paths[ppath] = out
        if detail:
            out = sorted(paths.values(), key=operator.itemgetter("name"))
            return out
        else:
            return sorted(paths)
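For reference, the concrete ZipFileSystem exercises the ls/info logic above; a small self-contained sketch, with illustrative in-memory paths (not part of the committed file):

import io
import zipfile

import fsspec

# build a small zip archive in memory
buf = io.BytesIO()
with zipfile.ZipFile(buf, "w") as z:
    z.writestr("data/a.csv", "x,y\n1,2\n")
    z.writestr("data/b.csv", "x,y\n3,4\n")
buf.seek(0)

fs = fsspec.filesystem("zip", fo=buf)
print(fs.ls("data", detail=False))  # members stored under data/
print(fs.info("data"))              # directory entry synthesised via _all_dirnames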
1110
.venv/lib/python3.10/site-packages/fsspec/asyn.py
Normal file
File diff suppressed because it is too large
1005
.venv/lib/python3.10/site-packages/fsspec/caching.py
Normal file
File diff suppressed because it is too large
324
.venv/lib/python3.10/site-packages/fsspec/callbacks.py
Normal file
@@ -0,0 +1,324 @@
|
||||
from functools import wraps
|
||||
|
||||
|
||||
class Callback:
|
||||
"""
|
||||
Base class and interface for callback mechanism
|
||||
|
||||
This class can be used directly for monitoring file transfers by
|
||||
providing ``callback=Callback(hooks=...)`` (see the ``hooks`` argument,
|
||||
below), or subclassed for more specialised behaviour.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
size: int (optional)
|
||||
Nominal quantity for the value that corresponds to a complete
|
||||
transfer, e.g., total number of tiles or total number of
|
||||
bytes
|
||||
value: int (0)
|
||||
Starting internal counter value
|
||||
hooks: dict or None
|
||||
A dict of named functions to be called on each update. The signature
|
||||
of these must be ``f(size, value, **kwargs)``
|
||||
"""
|
||||
|
||||
def __init__(self, size=None, value=0, hooks=None, **kwargs):
|
||||
self.size = size
|
||||
self.value = value
|
||||
self.hooks = hooks or {}
|
||||
self.kw = kwargs
|
||||
|
||||
def __enter__(self):
|
||||
return self
|
||||
|
||||
def __exit__(self, *exc_args):
|
||||
self.close()
|
||||
|
||||
def close(self):
|
||||
"""Close callback."""
|
||||
|
||||
def branched(self, path_1, path_2, **kwargs):
|
||||
"""
|
||||
Return callback for child transfers
|
||||
|
||||
If this callback is operating at a higher level, e.g., put, which may
|
||||
trigger transfers that can also be monitored. The function returns a callback
|
||||
that has to be passed to the child method, e.g., put_file,
|
||||
as `callback=` argument.
|
||||
|
||||
The implementation uses `callback.branch` for compatibility.
|
||||
When implementing callbacks, it is recommended to override this function instead
|
||||
of `branch` and avoid calling `super().branched(...)`.
|
||||
|
||||
Prefer using this function over `branch`.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
path_1: str
|
||||
Child's source path
|
||||
path_2: str
|
||||
Child's destination path
|
||||
**kwargs:
|
||||
Arbitrary keyword arguments
|
||||
|
||||
Returns
|
||||
-------
|
||||
callback: Callback
|
||||
A callback instance to be passed to the child method
|
||||
"""
|
||||
self.branch(path_1, path_2, kwargs)
|
||||
# mutate kwargs so that we can force the caller to pass "callback=" explicitly
|
||||
return kwargs.pop("callback", DEFAULT_CALLBACK)
|
||||
|
||||
def branch_coro(self, fn):
|
||||
"""
|
||||
Wraps a coroutine, and pass a new child callback to it.
|
||||
"""
|
||||
|
||||
@wraps(fn)
|
||||
async def func(path1, path2: str, **kwargs):
|
||||
with self.branched(path1, path2, **kwargs) as child:
|
||||
return await fn(path1, path2, callback=child, **kwargs)
|
||||
|
||||
return func
|
||||
|
||||
def set_size(self, size):
|
||||
"""
|
||||
Set the internal maximum size attribute
|
||||
|
||||
Usually called if not initially set at instantiation. Note that this
|
||||
triggers a ``call()``.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
size: int
|
||||
"""
|
||||
self.size = size
|
||||
self.call()
|
||||
|
||||
def absolute_update(self, value):
|
||||
"""
|
||||
Set the internal value state
|
||||
|
||||
Triggers ``call()``
|
||||
|
||||
Parameters
|
||||
----------
|
||||
value: int
|
||||
"""
|
||||
self.value = value
|
||||
self.call()
|
||||
|
||||
def relative_update(self, inc=1):
|
||||
"""
|
||||
Delta increment the internal counter
|
||||
|
||||
Triggers ``call()``
|
||||
|
||||
Parameters
|
||||
----------
|
||||
inc: int
|
||||
"""
|
||||
self.value += inc
|
||||
self.call()
|
||||
|
||||
def call(self, hook_name=None, **kwargs):
|
||||
"""
|
||||
Execute hook(s) with current state
|
||||
|
||||
Each function is passed the internal size and current value
|
||||
|
||||
Parameters
|
||||
----------
|
||||
hook_name: str or None
|
||||
If given, execute on this hook
|
||||
kwargs: passed on to (all) hook(s)
|
||||
"""
|
||||
if not self.hooks:
|
||||
return
|
||||
kw = self.kw.copy()
|
||||
kw.update(kwargs)
|
||||
if hook_name:
|
||||
if hook_name not in self.hooks:
|
||||
return
|
||||
return self.hooks[hook_name](self.size, self.value, **kw)
|
||||
for hook in self.hooks.values() or []:
|
||||
hook(self.size, self.value, **kw)
|
||||
|
||||
def wrap(self, iterable):
|
||||
"""
|
||||
Wrap an iterable to call ``relative_update`` on each iterations
|
||||
|
||||
Parameters
|
||||
----------
|
||||
iterable: Iterable
|
||||
The iterable that is being wrapped
|
||||
"""
|
||||
for item in iterable:
|
||||
self.relative_update()
|
||||
yield item
|
||||
|
||||
def branch(self, path_1, path_2, kwargs):
|
||||
"""
|
||||
Set callbacks for child transfers
|
||||
|
||||
If this callback is operating at a higher level, e.g., put, which may
|
||||
trigger transfers that can also be monitored. The passed kwargs are
|
||||
to be *mutated* to add ``callback=``, if this class supports branching
|
||||
to children.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
path_1: str
|
||||
Child's source path
|
||||
path_2: str
|
||||
Child's destination path
|
||||
kwargs: dict
|
||||
arguments passed to child method, e.g., put_file.
|
||||
|
||||
Returns
|
||||
-------
|
||||
|
||||
"""
|
||||
return None
|
||||
|
||||
def no_op(self, *_, **__):
|
||||
pass
|
||||
|
||||
def __getattr__(self, item):
|
||||
"""
|
||||
If undefined methods are called on this class, nothing happens
|
||||
"""
|
||||
return self.no_op
|
||||
|
||||
@classmethod
|
||||
def as_callback(cls, maybe_callback=None):
|
||||
"""Transform callback=... into Callback instance
|
||||
|
||||
For the special value of ``None``, return the global instance of
|
||||
``NoOpCallback``. This is an alternative to including
|
||||
``callback=DEFAULT_CALLBACK`` directly in a method signature.
|
||||
"""
|
||||
if maybe_callback is None:
|
||||
return DEFAULT_CALLBACK
|
||||
return maybe_callback
|
||||
|
||||
|
||||
class NoOpCallback(Callback):
|
||||
"""
|
||||
This implementation of Callback does exactly nothing
|
||||
"""
|
||||
|
||||
def call(self, *args, **kwargs):
|
||||
return None
|
||||
|
||||
|
||||
class DotPrinterCallback(Callback):
|
||||
"""
|
||||
Simple example Callback implementation
|
||||
|
||||
Almost identical to Callback with a hook that prints a char; here we
|
||||
demonstrate how the outer layer may print "#" and the inner layer "."
|
||||
"""
|
||||
|
||||
def __init__(self, chr_to_print="#", **kwargs):
|
||||
self.chr = chr_to_print
|
||||
super().__init__(**kwargs)
|
||||
|
||||
def branch(self, path_1, path_2, kwargs):
|
||||
"""Mutate kwargs to add new instance with different print char"""
|
||||
kwargs["callback"] = DotPrinterCallback(".")
|
||||
|
||||
def call(self, **kwargs):
|
||||
"""Just outputs a character"""
|
||||
print(self.chr, end="")
|
||||
|
||||
|
||||
class TqdmCallback(Callback):
|
||||
"""
|
||||
A callback to display a progress bar using tqdm
|
||||
|
||||
Parameters
|
||||
----------
|
||||
tqdm_kwargs : dict, (optional)
|
||||
Any argument accepted by the tqdm constructor.
|
||||
See the `tqdm doc <https://tqdm.github.io/docs/tqdm/#__init__>`_.
|
||||
Will be forwarded to `tqdm_cls`.
|
||||
tqdm_cls: (optional)
|
||||
subclass of `tqdm.tqdm`. If not passed, it will default to `tqdm.tqdm`.
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> import fsspec
|
||||
>>> from fsspec.callbacks import TqdmCallback
|
||||
>>> fs = fsspec.filesystem("memory")
|
||||
>>> path2distant_data = "/your-path"
|
||||
>>> fs.upload(
|
||||
".",
|
||||
path2distant_data,
|
||||
recursive=True,
|
||||
callback=TqdmCallback(),
|
||||
)
|
||||
|
||||
You can forward args to tqdm using the ``tqdm_kwargs`` parameter.
|
||||
|
||||
>>> fs.upload(
|
||||
".",
|
||||
path2distant_data,
|
||||
recursive=True,
|
||||
callback=TqdmCallback(tqdm_kwargs={"desc": "Your tqdm description"}),
|
||||
)
|
||||
|
||||
You can also customize the progress bar by passing a subclass of `tqdm`.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
class TqdmFormat(tqdm):
|
||||
'''Provides a `total_time` format parameter'''
|
||||
@property
|
||||
def format_dict(self):
|
||||
d = super().format_dict
|
||||
total_time = d["elapsed"] * (d["total"] or 0) / max(d["n"], 1)
|
||||
d.update(total_time=self.format_interval(total_time) + " in total")
|
||||
return d
|
||||
|
||||
>>> with TqdmCallback(
|
||||
tqdm_kwargs={
|
||||
"desc": "desc",
|
||||
"bar_format": "{total_time}: {percentage:.0f}%|{bar}{r_bar}",
|
||||
},
|
||||
tqdm_cls=TqdmFormat,
|
||||
) as callback:
|
||||
fs.upload(".", path2distant_data, recursive=True, callback=callback)
|
||||
"""
|
||||
|
||||
def __init__(self, tqdm_kwargs=None, *args, **kwargs):
|
||||
try:
|
||||
from tqdm import tqdm
|
||||
|
||||
except ImportError as exce:
|
||||
raise ImportError(
|
||||
"Using TqdmCallback requires tqdm to be installed"
|
||||
) from exce
|
||||
|
||||
self._tqdm_cls = kwargs.pop("tqdm_cls", tqdm)
|
||||
self._tqdm_kwargs = tqdm_kwargs or {}
|
||||
self.tqdm = None
|
||||
super().__init__(*args, **kwargs)
|
||||
|
||||
def call(self, *args, **kwargs):
|
||||
if self.tqdm is None:
|
||||
self.tqdm = self._tqdm_cls(total=self.size, **self._tqdm_kwargs)
|
||||
self.tqdm.total = self.size
|
||||
self.tqdm.update(self.value - self.tqdm.n)
|
||||
|
||||
def close(self):
|
||||
if self.tqdm is not None:
|
||||
self.tqdm.close()
|
||||
self.tqdm = None
|
||||
|
||||
def __del__(self):
|
||||
return self.close()
|
||||
|
||||
|
||||
DEFAULT_CALLBACK = _DEFAULT_CALLBACK = NoOpCallback()
|
||||
175
.venv/lib/python3.10/site-packages/fsspec/compression.py
Normal file
@@ -0,0 +1,175 @@
"""Helper functions for a standard streaming compression API"""

from zipfile import ZipFile

import fsspec.utils
from fsspec.spec import AbstractBufferedFile


def noop_file(file, mode, **kwargs):
    return file


# TODO: files should also be available as contexts
# should be functions of the form func(infile, mode=, **kwargs) -> file-like
compr = {None: noop_file}


def register_compression(name, callback, extensions, force=False):
    """Register an "inferable" file compression type.

    Registers transparent file compression type for use with fsspec.open.
    Compression can be specified by name in open, or "infer"-ed for any files
    ending with the given extensions.

    Args:
        name: (str) The compression type name. Eg. "gzip".
        callback: A callable of form (infile, mode, **kwargs) -> file-like.
            Accepts an input file-like object, the target mode and kwargs.
            Returns a wrapped file-like object.
        extensions: (str, Iterable[str]) A file extension, or list of file
            extensions for which to infer this compression scheme. Eg. "gz".
        force: (bool) Force re-registration of compression type or extensions.

    Raises:
        ValueError: If name or extensions already registered, and not force.

    """
    if isinstance(extensions, str):
        extensions = [extensions]

    # Validate registration
    if name in compr and not force:
        raise ValueError(f"Duplicate compression registration: {name}")

    for ext in extensions:
        if ext in fsspec.utils.compressions and not force:
            raise ValueError(f"Duplicate compression file extension: {ext} ({name})")

    compr[name] = callback

    for ext in extensions:
        fsspec.utils.compressions[ext] = name


def unzip(infile, mode="rb", filename=None, **kwargs):
    if "r" not in mode:
        filename = filename or "file"
        z = ZipFile(infile, mode="w", **kwargs)
        fo = z.open(filename, mode="w")
        fo.close = lambda closer=fo.close: closer() or z.close()
        return fo
    z = ZipFile(infile)
    if filename is None:
        filename = z.namelist()[0]
    return z.open(filename, mode="r", **kwargs)


register_compression("zip", unzip, "zip")

try:
    from bz2 import BZ2File
except ImportError:
    pass
else:
    register_compression("bz2", BZ2File, "bz2")

try:  # pragma: no cover
    from isal import igzip

    def isal(infile, mode="rb", **kwargs):
        return igzip.IGzipFile(fileobj=infile, mode=mode, **kwargs)

    register_compression("gzip", isal, "gz")
except ImportError:
    from gzip import GzipFile

    register_compression(
        "gzip", lambda f, **kwargs: GzipFile(fileobj=f, **kwargs), "gz"
    )

try:
    from lzma import LZMAFile

    register_compression("lzma", LZMAFile, "lzma")
    register_compression("xz", LZMAFile, "xz")
except ImportError:
    pass

try:
    import lzmaffi

    register_compression("lzma", lzmaffi.LZMAFile, "lzma", force=True)
    register_compression("xz", lzmaffi.LZMAFile, "xz", force=True)
except ImportError:
    pass


class SnappyFile(AbstractBufferedFile):
    def __init__(self, infile, mode, **kwargs):
        import snappy

        super().__init__(
            fs=None, path="snappy", mode=mode.strip("b") + "b", size=999999999, **kwargs
        )
        self.infile = infile
        if "r" in mode:
            self.codec = snappy.StreamDecompressor()
        else:
            self.codec = snappy.StreamCompressor()

    def _upload_chunk(self, final=False):
        self.buffer.seek(0)
        out = self.codec.add_chunk(self.buffer.read())
        self.infile.write(out)
        return True

    def seek(self, loc, whence=0):
        raise NotImplementedError("SnappyFile is not seekable")

    def seekable(self):
        return False

    def _fetch_range(self, start, end):
        """Get the specified set of bytes from remote"""
        data = self.infile.read(end - start)
        return self.codec.decompress(data)


try:
    import snappy

    snappy.compress(b"")
    # Snappy may use the .sz file extension, but this is not part of the
    # standard implementation.
    register_compression("snappy", SnappyFile, [])

except (ImportError, NameError, AttributeError):
    pass

try:
    import lz4.frame

    register_compression("lz4", lz4.frame.open, "lz4")
except ImportError:
    pass

try:
    import zstandard as zstd

    def zstandard_file(infile, mode="rb"):
        if "r" in mode:
            cctx = zstd.ZstdDecompressor()
            return cctx.stream_reader(infile)
        else:
            cctx = zstd.ZstdCompressor(level=10)
            return cctx.stream_writer(infile)

    register_compression("zstd", zstandard_file, "zst")
except ImportError:
    pass


def available_compressions():
    """Return a list of the implemented compressions."""
    return list(compr)
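A short sketch of how this registry is consumed by fsspec.open: compression can be named explicitly or inferred from the ".gz" suffix (the memory:// path is illustrative, not part of the committed file):

import fsspec

# write gzip-compressed text; "infer" picks gzip from the .gz extension
with fsspec.open("memory://logs/example.txt.gz", "wt", compression="infer") as f:
    f.write("hello world\n")

# read it back, naming the codec explicitly this time
with fsspec.open("memory://logs/example.txt.gz", "rt", compression="gzip") as f:
    print(f.read())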
131
.venv/lib/python3.10/site-packages/fsspec/config.py
Normal file
@@ -0,0 +1,131 @@
from __future__ import annotations

import configparser
import json
import os
import warnings
from typing import Any

conf: dict[str, dict[str, Any]] = {}
default_conf_dir = os.path.join(os.path.expanduser("~"), ".config/fsspec")
conf_dir = os.environ.get("FSSPEC_CONFIG_DIR", default_conf_dir)


def set_conf_env(conf_dict, envdict=os.environ):
    """Set config values from environment variables

    Looks for variables of the form ``FSSPEC_<protocol>`` and
    ``FSSPEC_<protocol>_<kwarg>``. For ``FSSPEC_<protocol>`` the value is parsed
    as a json dictionary and used to ``update`` the config of the
    corresponding protocol. For ``FSSPEC_<protocol>_<kwarg>`` there is no
    attempt to convert the string value, but the kwarg keys will be lower-cased.

    The ``FSSPEC_<protocol>_<kwarg>`` variables are applied after the
    ``FSSPEC_<protocol>`` ones.

    Parameters
    ----------
    conf_dict : dict(str, dict)
        This dict will be mutated
    envdict : dict-like(str, str)
        Source for the values - usually the real environment
    """
    kwarg_keys = []
    for key in envdict:
        if key.startswith("FSSPEC_") and len(key) > 7 and key[7] != "_":
            if key.count("_") > 1:
                kwarg_keys.append(key)
                continue
            try:
                value = json.loads(envdict[key])
            except json.decoder.JSONDecodeError as ex:
                warnings.warn(
                    f"Ignoring environment variable {key} due to a parse failure: {ex}"
                )
            else:
                if isinstance(value, dict):
                    _, proto = key.split("_", 1)
                    conf_dict.setdefault(proto.lower(), {}).update(value)
                else:
                    warnings.warn(
                        f"Ignoring environment variable {key} due to not being a dict:"
                        f" {type(value)}"
                    )
        elif key.startswith("FSSPEC"):
            warnings.warn(
                f"Ignoring environment variable {key} due to having an unexpected name"
            )

    for key in kwarg_keys:
        _, proto, kwarg = key.split("_", 2)
        conf_dict.setdefault(proto.lower(), {})[kwarg.lower()] = envdict[key]


def set_conf_files(cdir, conf_dict):
    """Set config values from files

    Scans for INI and JSON files in the given dictionary, and uses their
    contents to set the config. In case of repeated values, later values
    win.

    In the case of INI files, all values are strings, and these will not
    be converted.

    Parameters
    ----------
    cdir : str
        Directory to search
    conf_dict : dict(str, dict)
        This dict will be mutated
    """
    if not os.path.isdir(cdir):
        return
    allfiles = sorted(os.listdir(cdir))
    for fn in allfiles:
        if fn.endswith(".ini"):
            ini = configparser.ConfigParser()
            ini.read(os.path.join(cdir, fn))
            for key in ini:
                if key == "DEFAULT":
                    continue
                conf_dict.setdefault(key, {}).update(dict(ini[key]))
        if fn.endswith(".json"):
            with open(os.path.join(cdir, fn)) as f:
                js = json.load(f)
            for key in js:
                conf_dict.setdefault(key, {}).update(dict(js[key]))


def apply_config(cls, kwargs, conf_dict=None):
    """Supply default values for kwargs when instantiating class

    Augments the passed kwargs, by finding entries in the config dict
    which match the classes ``.protocol`` attribute (one or more str)

    Parameters
    ----------
    cls : file system implementation
    kwargs : dict
    conf_dict : dict of dict
        Typically this is the global configuration

    Returns
    -------
    dict : the modified set of kwargs
    """
    if conf_dict is None:
        conf_dict = conf
    protos = cls.protocol if isinstance(cls.protocol, (tuple, list)) else [cls.protocol]
    kw = {}
    for proto in protos:
        # default kwargs from the current state of the config
        if proto in conf_dict:
            kw.update(conf_dict[proto])
    # explicit kwargs always win
    kw.update(**kwargs)
    kwargs = kw
    return kwargs


set_conf_files(conf_dir, conf)
set_conf_env(conf)
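A minimal sketch of the FSSPEC_<protocol> parsing above, run against a throwaway dict instead of the real environment (the MYPROTO protocol name is hypothetical):

from fsspec.config import set_conf_env

cfg = {}
set_conf_env(
    cfg,
    envdict={
        "FSSPEC_MYPROTO": '{"anon": true}',  # JSON dict -> whole-protocol config
        "FSSPEC_MYPROTO_TIMEOUT": "30",      # single kwarg, kept as a string
    },
)
print(cfg)  # {'myproto': {'anon': True, 'timeout': '30'}}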
55
.venv/lib/python3.10/site-packages/fsspec/conftest.py
Normal file
@@ -0,0 +1,55 @@
import os
import shutil
import subprocess
import sys
import time

import pytest

import fsspec
from fsspec.implementations.cached import CachingFileSystem


@pytest.fixture()
def m():
    """
    Fixture providing a memory filesystem.
    """
    m = fsspec.filesystem("memory")
    m.store.clear()
    m.pseudo_dirs.clear()
    m.pseudo_dirs.append("")
    try:
        yield m
    finally:
        m.store.clear()
        m.pseudo_dirs.clear()
        m.pseudo_dirs.append("")


@pytest.fixture
def ftp_writable(tmpdir):
    """
    Fixture providing a writable FTP filesystem.
    """
    pytest.importorskip("pyftpdlib")
    from fsspec.implementations.ftp import FTPFileSystem

    FTPFileSystem.clear_instance_cache()  # remove lingering connections
    CachingFileSystem.clear_instance_cache()
    d = str(tmpdir)
    with open(os.path.join(d, "out"), "wb") as f:
        f.write(b"hello" * 10000)
    P = subprocess.Popen(
        [sys.executable, "-m", "pyftpdlib", "-d", d, "-u", "user", "-P", "pass", "-w"]
    )
    try:
        time.sleep(1)
        yield "localhost", 2121, "user", "pass"
    finally:
        P.terminate()
        P.wait()
        try:
            shutil.rmtree(tmpdir)
        except Exception:
            pass
743
.venv/lib/python3.10/site-packages/fsspec/core.py
Normal file
@@ -0,0 +1,743 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import io
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
from glob import has_magic
|
||||
from pathlib import Path
|
||||
|
||||
# for backwards compat, we export cache things from here too
|
||||
from fsspec.caching import ( # noqa: F401
|
||||
BaseCache,
|
||||
BlockCache,
|
||||
BytesCache,
|
||||
MMapCache,
|
||||
ReadAheadCache,
|
||||
caches,
|
||||
)
|
||||
from fsspec.compression import compr
|
||||
from fsspec.config import conf
|
||||
from fsspec.registry import filesystem, get_filesystem_class
|
||||
from fsspec.utils import (
|
||||
_unstrip_protocol,
|
||||
build_name_function,
|
||||
infer_compression,
|
||||
stringify_path,
|
||||
)
|
||||
|
||||
logger = logging.getLogger("fsspec")
|
||||
|
||||
|
||||
class OpenFile:
|
||||
"""
|
||||
File-like object to be used in a context
|
||||
|
||||
Can layer (buffered) text-mode and compression over any file-system, which
|
||||
are typically binary-only.
|
||||
|
||||
These instances are safe to serialize, as the low-level file object
|
||||
is not created until invoked using ``with``.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
fs: FileSystem
|
||||
The file system to use for opening the file. Should be a subclass or duck-type
|
||||
with ``fsspec.spec.AbstractFileSystem``
|
||||
path: str
|
||||
Location to open
|
||||
mode: str like 'rb', optional
|
||||
Mode of the opened file
|
||||
compression: str or None, optional
|
||||
Compression to apply
|
||||
encoding: str or None, optional
|
||||
The encoding to use if opened in text mode.
|
||||
errors: str or None, optional
|
||||
How to handle encoding errors if opened in text mode.
|
||||
newline: None or str
|
||||
Passed to TextIOWrapper in text mode, how to handle line endings.
|
||||
autoopen: bool
|
||||
If True, calls open() immediately. Mostly used by pickle
|
||||
pos: int
|
||||
If given and autoopen is True, seek to this location immediately
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
fs,
|
||||
path,
|
||||
mode="rb",
|
||||
compression=None,
|
||||
encoding=None,
|
||||
errors=None,
|
||||
newline=None,
|
||||
):
|
||||
self.fs = fs
|
||||
self.path = path
|
||||
self.mode = mode
|
||||
self.compression = get_compression(path, compression)
|
||||
self.encoding = encoding
|
||||
self.errors = errors
|
||||
self.newline = newline
|
||||
self.fobjects = []
|
||||
|
||||
def __reduce__(self):
|
||||
return (
|
||||
OpenFile,
|
||||
(
|
||||
self.fs,
|
||||
self.path,
|
||||
self.mode,
|
||||
self.compression,
|
||||
self.encoding,
|
||||
self.errors,
|
||||
self.newline,
|
||||
),
|
||||
)
|
||||
|
||||
def __repr__(self):
|
||||
return f"<OpenFile '{self.path}'>"
|
||||
|
||||
def __enter__(self):
|
||||
mode = self.mode.replace("t", "").replace("b", "") + "b"
|
||||
|
||||
try:
|
||||
f = self.fs.open(self.path, mode=mode)
|
||||
except FileNotFoundError as e:
|
||||
if has_magic(self.path):
|
||||
raise FileNotFoundError(
|
||||
"%s not found. The URL contains glob characters: you maybe needed\n"
|
||||
"to pass expand=True in fsspec.open() or the storage_options of \n"
|
||||
"your library. You can also set the config value 'open_expand'\n"
|
||||
"before import, or fsspec.core.DEFAULT_EXPAND at runtime, to True.",
|
||||
self.path,
|
||||
) from e
|
||||
raise
|
||||
|
||||
self.fobjects = [f]
|
||||
|
||||
if self.compression is not None:
|
||||
compress = compr[self.compression]
|
||||
f = compress(f, mode=mode[0])
|
||||
self.fobjects.append(f)
|
||||
|
||||
if "b" not in self.mode:
|
||||
# assume, for example, that 'r' is equivalent to 'rt' as in builtin
|
||||
f = PickleableTextIOWrapper(
|
||||
f, encoding=self.encoding, errors=self.errors, newline=self.newline
|
||||
)
|
||||
self.fobjects.append(f)
|
||||
|
||||
return self.fobjects[-1]
|
||||
|
||||
def __exit__(self, *args):
|
||||
self.close()
|
||||
|
||||
@property
|
||||
def full_name(self):
|
||||
return _unstrip_protocol(self.path, self.fs)
|
||||
|
||||
def open(self):
|
||||
"""Materialise this as a real open file without context
|
||||
|
||||
The OpenFile object should be explicitly closed to avoid enclosed file
|
||||
instances persisting. You must, therefore, keep a reference to the OpenFile
|
||||
during the life of the file-like it generates.
|
||||
"""
|
||||
return self.__enter__()
|
||||
|
||||
def close(self):
|
||||
"""Close all encapsulated file objects"""
|
||||
for f in reversed(self.fobjects):
|
||||
if "r" not in self.mode and not f.closed:
|
||||
f.flush()
|
||||
f.close()
|
||||
self.fobjects.clear()
|
||||
|
||||
|
||||
class OpenFiles(list):
|
||||
"""List of OpenFile instances
|
||||
|
||||
Can be used in a single context, which opens and closes all of the
|
||||
contained files. Normal list access to get the elements works as
|
||||
normal.
|
||||
|
||||
A special case is made for caching filesystems - the files will
|
||||
be down/uploaded together at the start or end of the context, and
|
||||
this may happen concurrently, if the target filesystem supports it.
|
||||
"""
|
||||
|
||||
def __init__(self, *args, mode="rb", fs=None):
|
||||
self.mode = mode
|
||||
self.fs = fs
|
||||
self.files = []
|
||||
super().__init__(*args)
|
||||
|
||||
def __enter__(self):
|
||||
if self.fs is None:
|
||||
raise ValueError("Context has already been used")
|
||||
|
||||
fs = self.fs
|
||||
while True:
|
||||
if hasattr(fs, "open_many"):
|
||||
# check for concurrent cache download; or set up for upload
|
||||
self.files = fs.open_many(self)
|
||||
return self.files
|
||||
if hasattr(fs, "fs") and fs.fs is not None:
|
||||
fs = fs.fs
|
||||
else:
|
||||
break
|
||||
return [s.__enter__() for s in self]
|
||||
|
||||
def __exit__(self, *args):
|
||||
fs = self.fs
|
||||
[s.__exit__(*args) for s in self]
|
||||
if "r" not in self.mode:
|
||||
while True:
|
||||
if hasattr(fs, "open_many"):
|
||||
# check for concurrent cache upload
|
||||
fs.commit_many(self.files)
|
||||
return
|
||||
if hasattr(fs, "fs") and fs.fs is not None:
|
||||
fs = fs.fs
|
||||
else:
|
||||
break
|
||||
|
||||
def __getitem__(self, item):
|
||||
out = super().__getitem__(item)
|
||||
if isinstance(item, slice):
|
||||
return OpenFiles(out, mode=self.mode, fs=self.fs)
|
||||
return out
|
||||
|
||||
def __repr__(self):
|
||||
return f"<List of {len(self)} OpenFile instances>"
|
||||
|
||||
|
||||
def open_files(
|
||||
urlpath,
|
||||
mode="rb",
|
||||
compression=None,
|
||||
encoding="utf8",
|
||||
errors=None,
|
||||
name_function=None,
|
||||
num=1,
|
||||
protocol=None,
|
||||
newline=None,
|
||||
auto_mkdir=True,
|
||||
expand=True,
|
||||
**kwargs,
|
||||
):
|
||||
"""Given a path or paths, return a list of ``OpenFile`` objects.
|
||||
|
||||
For writing, a str path must contain the "*" character, which will be filled
|
||||
in by increasing numbers, e.g., "part*" -> "part1", "part2" if num=2.
|
||||
|
||||
For either reading or writing, can instead provide explicit list of paths.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
urlpath: string or list
|
||||
Absolute or relative filepath(s). Prefix with a protocol like ``s3://``
|
||||
to read from alternative filesystems. To read from multiple files you
|
||||
can pass a globstring or a list of paths, with the caveat that they
|
||||
must all have the same protocol.
|
||||
mode: 'rb', 'wt', etc.
|
||||
compression: string or None
|
||||
If given, open file using compression codec. Can either be a compression
|
||||
name (a key in ``fsspec.compression.compr``) or "infer" to guess the
|
||||
compression from the filename suffix.
|
||||
encoding: str
|
||||
For text mode only
|
||||
errors: None or str
|
||||
Passed to TextIOWrapper in text mode
|
||||
name_function: function or None
|
||||
if opening a set of files for writing, those files do not yet exist,
|
||||
so we need to generate their names by formatting the urlpath for
|
||||
each sequence number
|
||||
num: int [1]
|
||||
if writing mode, number of files we expect to create (passed to
|
||||
name+function)
|
||||
protocol: str or None
|
||||
If given, overrides the protocol found in the URL.
|
||||
newline: bytes or None
|
||||
Used for line terminator in text mode. If None, uses system default;
|
||||
if blank, uses no translation.
|
||||
auto_mkdir: bool (True)
|
||||
If in write mode, this will ensure the target directory exists before
|
||||
writing, by calling ``fs.mkdirs(exist_ok=True)``.
|
||||
expand: bool
|
||||
**kwargs: dict
|
||||
Extra options that make sense to a particular storage connection, e.g.
|
||||
host, port, username, password, etc.
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> files = open_files('2015-*-*.csv') # doctest: +SKIP
|
||||
>>> files = open_files(
|
||||
... 's3://bucket/2015-*-*.csv.gz', compression='gzip'
|
||||
... ) # doctest: +SKIP
|
||||
|
||||
Returns
|
||||
-------
|
||||
An ``OpenFiles`` instance, which is a list of ``OpenFile`` objects that can
|
||||
be used as a single context
|
||||
|
||||
Notes
|
||||
-----
|
||||
For a full list of the available protocols and the implementations that
|
||||
they map across to see the latest online documentation:
|
||||
|
||||
- For implementations built into ``fsspec`` see
|
||||
https://filesystem-spec.readthedocs.io/en/latest/api.html#built-in-implementations
|
||||
- For implementations in separate packages see
|
||||
https://filesystem-spec.readthedocs.io/en/latest/api.html#other-known-implementations
|
||||
"""
|
||||
fs, fs_token, paths = get_fs_token_paths(
|
||||
urlpath,
|
||||
mode,
|
||||
num=num,
|
||||
name_function=name_function,
|
||||
storage_options=kwargs,
|
||||
protocol=protocol,
|
||||
expand=expand,
|
||||
)
|
||||
if fs.protocol == "file":
|
||||
fs.auto_mkdir = auto_mkdir
|
||||
elif "r" not in mode and auto_mkdir:
|
||||
parents = {fs._parent(path) for path in paths}
|
||||
for parent in parents:
|
||||
try:
|
||||
fs.makedirs(parent, exist_ok=True)
|
||||
except PermissionError:
|
||||
pass
|
||||
return OpenFiles(
|
||||
[
|
||||
OpenFile(
|
||||
fs,
|
||||
path,
|
||||
mode=mode,
|
||||
compression=compression,
|
||||
encoding=encoding,
|
||||
errors=errors,
|
||||
newline=newline,
|
||||
)
|
||||
for path in paths
|
||||
],
|
||||
mode=mode,
|
||||
fs=fs,
|
||||
)
|
||||
|
||||
|
||||
def _un_chain(path, kwargs):
|
||||
# Avoid a circular import
|
||||
from fsspec.implementations.cached import CachingFileSystem
|
||||
|
||||
if "::" in path:
|
||||
x = re.compile(".*[^a-z]+.*") # test for non protocol-like single word
|
||||
bits = []
|
||||
for p in path.split("::"):
|
||||
if "://" in p or x.match(p):
|
||||
bits.append(p)
|
||||
else:
|
||||
bits.append(p + "://")
|
||||
else:
|
||||
bits = [path]
|
||||
# [[url, protocol, kwargs], ...]
|
||||
out = []
|
||||
previous_bit = None
|
||||
kwargs = kwargs.copy()
|
||||
for bit in reversed(bits):
|
||||
protocol = kwargs.pop("protocol", None) or split_protocol(bit)[0] or "file"
|
||||
cls = get_filesystem_class(protocol)
|
||||
extra_kwargs = cls._get_kwargs_from_urls(bit)
|
||||
kws = kwargs.pop(protocol, {})
|
||||
if bit is bits[0]:
|
||||
kws.update(kwargs)
|
||||
kw = dict(
|
||||
**{k: v for k, v in extra_kwargs.items() if k not in kws or v != kws[k]},
|
||||
**kws,
|
||||
)
|
||||
bit = cls._strip_protocol(bit)
|
||||
if "target_protocol" not in kw and issubclass(cls, CachingFileSystem):
|
||||
bit = previous_bit
|
||||
out.append((bit, protocol, kw))
|
||||
previous_bit = bit
|
||||
out.reverse()
|
||||
return out
|
||||
|
||||
|
||||
def url_to_fs(url, **kwargs):
|
||||
"""
|
||||
Turn fully-qualified and potentially chained URL into filesystem instance
|
||||
|
||||
Parameters
|
||||
----------
|
||||
url : str
|
||||
The fsspec-compatible URL
|
||||
**kwargs: dict
|
||||
Extra options that make sense to a particular storage connection, e.g.
|
||||
host, port, username, password, etc.
|
||||
|
||||
Returns
|
||||
-------
|
||||
filesystem : FileSystem
|
||||
The new filesystem discovered from ``url`` and created with
|
||||
``**kwargs``.
|
||||
urlpath : str
|
||||
The file-systems-specific URL for ``url``.
|
||||
"""
|
||||
url = stringify_path(url)
|
||||
# non-FS arguments that appear in fsspec.open()
|
||||
# inspect could keep this in sync with open()'s signature
|
||||
known_kwargs = {
|
||||
"compression",
|
||||
"encoding",
|
||||
"errors",
|
||||
"expand",
|
||||
"mode",
|
||||
"name_function",
|
||||
"newline",
|
||||
"num",
|
||||
}
|
||||
kwargs = {k: v for k, v in kwargs.items() if k not in known_kwargs}
|
||||
chain = _un_chain(url, kwargs)
|
||||
inkwargs = {}
|
||||
# Reverse iterate the chain, creating a nested target_* structure
|
||||
for i, ch in enumerate(reversed(chain)):
|
||||
urls, protocol, kw = ch
|
||||
if i == len(chain) - 1:
|
||||
inkwargs = dict(**kw, **inkwargs)
|
||||
continue
|
||||
inkwargs["target_options"] = dict(**kw, **inkwargs)
|
||||
inkwargs["target_protocol"] = protocol
|
||||
inkwargs["fo"] = urls
|
||||
urlpath, protocol, _ = chain[0]
|
||||
fs = filesystem(protocol, **inkwargs)
|
||||
return fs, urlpath
|
||||
|
||||
|
||||
DEFAULT_EXPAND = conf.get("open_expand", False)
|
||||
|
||||
|
||||
def open(
|
||||
urlpath,
|
||||
mode="rb",
|
||||
compression=None,
|
||||
encoding="utf8",
|
||||
errors=None,
|
||||
protocol=None,
|
||||
newline=None,
|
||||
expand=None,
|
||||
**kwargs,
|
||||
):
|
||||
"""Given a path or paths, return one ``OpenFile`` object.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
urlpath: string or list
|
||||
Absolute or relative filepath. Prefix with a protocol like ``s3://``
|
||||
to read from alternative filesystems. Should not include glob
|
||||
character(s).
|
||||
mode: 'rb', 'wt', etc.
|
||||
compression: string or None
|
||||
If given, open file using compression codec. Can either be a compression
|
||||
name (a key in ``fsspec.compression.compr``) or "infer" to guess the
|
||||
compression from the filename suffix.
|
||||
encoding: str
|
||||
For text mode only
|
||||
errors: None or str
|
||||
Passed to TextIOWrapper in text mode
|
||||
protocol: str or None
|
||||
If given, overrides the protocol found in the URL.
|
||||
newline: bytes or None
|
||||
Used for line terminator in text mode. If None, uses system default;
|
||||
if blank, uses no translation.
|
||||
expand: bool or None
|
||||
Whether to regard file paths containing special glob characters as needing
|
||||
expansion (finding the first match) or absolute. Setting False allows using
|
||||
paths which do embed such characters. If None (default), this argument
|
||||
takes its value from the DEFAULT_EXPAND module variable, which takes
|
||||
its initial value from the "open_expand" config value at startup, which will
|
||||
be False if not set.
|
||||
**kwargs: dict
|
||||
Extra options that make sense to a particular storage connection, e.g.
|
||||
host, port, username, password, etc.
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> openfile = open('2015-01-01.csv') # doctest: +SKIP
|
||||
>>> openfile = open(
|
||||
... 's3://bucket/2015-01-01.csv.gz', compression='gzip'
|
||||
... ) # doctest: +SKIP
|
||||
>>> with openfile as f:
|
||||
... df = pd.read_csv(f) # doctest: +SKIP
|
||||
...
|
||||
|
||||
Returns
|
||||
-------
|
||||
``OpenFile`` object.
|
||||
|
||||
Notes
|
||||
-----
|
||||
For a full list of the available protocols and the implementations that
|
||||
they map across to see the latest online documentation:
|
||||
|
||||
- For implementations built into ``fsspec`` see
|
||||
https://filesystem-spec.readthedocs.io/en/latest/api.html#built-in-implementations
|
||||
- For implementations in separate packages see
|
||||
https://filesystem-spec.readthedocs.io/en/latest/api.html#other-known-implementations
|
||||
"""
|
||||
expand = DEFAULT_EXPAND if expand is None else expand
|
||||
out = open_files(
|
||||
urlpath=[urlpath],
|
||||
mode=mode,
|
||||
compression=compression,
|
||||
encoding=encoding,
|
||||
errors=errors,
|
||||
protocol=protocol,
|
||||
newline=newline,
|
||||
expand=expand,
|
||||
**kwargs,
|
||||
)
|
||||
if not out:
|
||||
raise FileNotFoundError(urlpath)
|
||||
return out[0]
|
||||
|
||||
|
||||
def open_local(
|
||||
url: str | list[str] | Path | list[Path],
|
||||
mode: str = "rb",
|
||||
**storage_options: dict,
|
||||
) -> str | list[str]:
|
||||
"""Open file(s) which can be resolved to local
|
||||
|
||||
For files which either are local, or get downloaded upon open
|
||||
(e.g., by file caching)
|
||||
|
||||
Parameters
|
||||
----------
|
||||
url: str or list(str)
|
||||
mode: str
|
||||
Must be read mode
|
||||
storage_options:
|
||||
passed on to FS for or used by open_files (e.g., compression)
|
||||
"""
|
||||
if "r" not in mode:
|
||||
raise ValueError("Can only ensure local files when reading")
|
||||
of = open_files(url, mode=mode, **storage_options)
|
||||
if not getattr(of[0].fs, "local_file", False):
|
||||
raise ValueError(
|
||||
"open_local can only be used on a filesystem which"
|
||||
" has attribute local_file=True"
|
||||
)
|
||||
with of as files:
|
||||
paths = [f.name for f in files]
|
||||
if (isinstance(url, str) and not has_magic(url)) or isinstance(url, Path):
|
||||
return paths[0]
|
||||
return paths
|
||||
|
||||
|
||||
def get_compression(urlpath, compression):
|
||||
if compression == "infer":
|
||||
compression = infer_compression(urlpath)
|
||||
if compression is not None and compression not in compr:
|
||||
raise ValueError(f"Compression type {compression} not supported")
|
||||
return compression
|
||||
|
||||
|
||||
def split_protocol(urlpath):
|
||||
"""Return protocol, path pair"""
|
||||
urlpath = stringify_path(urlpath)
|
||||
if "://" in urlpath:
|
||||
protocol, path = urlpath.split("://", 1)
|
||||
if len(protocol) > 1:
|
||||
# excludes Windows paths
|
||||
return protocol, path
|
||||
if urlpath.startswith("data:"):
|
||||
return urlpath.split(":", 1)
|
||||
return None, urlpath
|
||||
|
||||
|
||||
def strip_protocol(urlpath):
|
||||
"""Return only path part of full URL, according to appropriate backend"""
|
||||
protocol, _ = split_protocol(urlpath)
|
||||
cls = get_filesystem_class(protocol)
|
||||
return cls._strip_protocol(urlpath)
|
||||
|
||||
|
||||
def expand_paths_if_needed(paths, mode, num, fs, name_function):
|
||||
"""Expand paths if they have a ``*`` in them (write mode) or any of ``*?[]``
|
||||
in them (read mode).
|
||||
|
||||
:param paths: list of paths
|
||||
mode: str
|
||||
Mode in which to open files.
|
||||
num: int
|
||||
If opening in writing mode, number of files we expect to create.
|
||||
fs: filesystem object
|
||||
name_function: callable
|
||||
If opening in writing mode, this callable is used to generate path
|
||||
names. Names are generated for each partition by
|
||||
``urlpath.replace('*', name_function(partition_index))``.
|
||||
:return: list of paths
|
||||
"""
|
||||
expanded_paths = []
|
||||
paths = list(paths)
|
||||
|
||||
if "w" in mode: # read mode
|
||||
if sum(1 for p in paths if "*" in p) > 1:
|
||||
raise ValueError(
|
||||
"When writing data, only one filename mask can be specified."
|
||||
)
|
||||
num = max(num, len(paths))
|
||||
|
||||
for curr_path in paths:
|
||||
if "*" in curr_path:
|
||||
# expand using name_function
|
||||
expanded_paths.extend(_expand_paths(curr_path, name_function, num))
|
||||
else:
|
||||
expanded_paths.append(curr_path)
|
||||
# if we generated more paths that asked for, trim the list
|
||||
if len(expanded_paths) > num:
|
||||
expanded_paths = expanded_paths[:num]
|
||||
|
||||
else: # read mode
|
||||
for curr_path in paths:
|
||||
if has_magic(curr_path):
|
||||
# expand using glob
|
||||
expanded_paths.extend(fs.glob(curr_path))
|
||||
else:
|
||||
expanded_paths.append(curr_path)
|
||||
|
||||
return expanded_paths
|
||||
|
||||
|
||||
def get_fs_token_paths(
|
||||
urlpath,
|
||||
mode="rb",
|
||||
num=1,
|
||||
name_function=None,
|
||||
storage_options=None,
|
||||
protocol=None,
|
||||
expand=True,
|
||||
):
|
||||
"""Filesystem, deterministic token, and paths from a urlpath and options.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
urlpath: string or iterable
|
||||
Absolute or relative filepath, URL (may include protocols like
|
||||
``s3://``), or globstring pointing to data.
|
||||
mode: str, optional
|
||||
Mode in which to open files.
|
||||
num: int, optional
|
||||
If opening in writing mode, number of files we expect to create.
|
||||
name_function: callable, optional
|
||||
If opening in writing mode, this callable is used to generate path
|
||||
names. Names are generated for each partition by
|
||||
``urlpath.replace('*', name_function(partition_index))``.
|
||||
storage_options: dict, optional
|
||||
Additional keywords to pass to the filesystem class.
|
||||
protocol: str or None
|
||||
To override the protocol specifier in the URL
|
||||
expand: bool
|
||||
Expand string paths for writing, assuming the path is a directory
|
||||
"""
|
||||
if isinstance(urlpath, (list, tuple, set)):
|
||||
if not urlpath:
|
||||
raise ValueError("empty urlpath sequence")
|
||||
urlpath0 = stringify_path(next(iter(urlpath)))
|
||||
else:
|
||||
urlpath0 = stringify_path(urlpath)
|
||||
storage_options = storage_options or {}
|
||||
if protocol:
|
||||
storage_options["protocol"] = protocol
|
||||
chain = _un_chain(urlpath0, storage_options or {})
|
||||
inkwargs = {}
|
||||
# Reverse iterate the chain, creating a nested target_* structure
|
||||
for i, ch in enumerate(reversed(chain)):
|
||||
urls, nested_protocol, kw = ch
|
||||
if i == len(chain) - 1:
|
||||
inkwargs = dict(**kw, **inkwargs)
|
||||
continue
|
||||
inkwargs["target_options"] = dict(**kw, **inkwargs)
|
||||
inkwargs["target_protocol"] = nested_protocol
|
||||
inkwargs["fo"] = urls
|
||||
paths, protocol, _ = chain[0]
|
||||
fs = filesystem(protocol, **inkwargs)
|
||||
if isinstance(urlpath, (list, tuple, set)):
|
||||
pchains = [
|
||||
_un_chain(stringify_path(u), storage_options or {})[0] for u in urlpath
|
||||
]
|
||||
if len({pc[1] for pc in pchains}) > 1:
|
||||
raise ValueError("Protocol mismatch getting fs from %s", urlpath)
|
||||
paths = [pc[0] for pc in pchains]
|
||||
else:
|
||||
paths = fs._strip_protocol(paths)
|
||||
if isinstance(paths, (list, tuple, set)):
|
||||
if expand:
|
||||
paths = expand_paths_if_needed(paths, mode, num, fs, name_function)
|
||||
elif not isinstance(paths, list):
|
||||
paths = list(paths)
|
||||
else:
|
||||
if ("w" in mode or "x" in mode) and expand:
|
||||
paths = _expand_paths(paths, name_function, num)
|
||||
elif "*" in paths:
|
||||
paths = [f for f in sorted(fs.glob(paths)) if not fs.isdir(f)]
|
||||
else:
|
||||
paths = [paths]
|
||||
|
||||
return fs, fs._fs_token, paths
|
||||
|
||||
|
||||
def _expand_paths(path, name_function, num):
|
||||
if isinstance(path, str):
|
||||
if path.count("*") > 1:
|
||||
raise ValueError("Output path spec must contain exactly one '*'.")
|
||||
elif "*" not in path:
|
||||
path = os.path.join(path, "*.part")
|
||||
|
||||
if name_function is None:
|
||||
name_function = build_name_function(num - 1)
|
||||
|
||||
paths = [path.replace("*", name_function(i)) for i in range(num)]
|
||||
if paths != sorted(paths):
|
||||
logger.warning(
|
||||
"In order to preserve order between partitions"
|
||||
" paths created with ``name_function`` should "
|
||||
"sort to partition order"
|
||||
)
|
||||
elif isinstance(path, (tuple, list)):
|
||||
assert len(path) == num
|
||||
paths = list(path)
|
||||
else:
|
||||
raise ValueError(
|
||||
"Path should be either\n"
|
||||
"1. A list of paths: ['foo.json', 'bar.json', ...]\n"
|
||||
"2. A directory: 'foo/\n"
|
||||
"3. A path with a '*' in it: 'foo.*.json'"
|
||||
)
|
||||
return paths
|
||||
|
||||
|
||||
class PickleableTextIOWrapper(io.TextIOWrapper):
|
||||
"""TextIOWrapper cannot be pickled. This solves it.
|
||||
|
||||
Requires that ``buffer`` be pickleable, which all instances of
|
||||
AbstractBufferedFile are.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
buffer,
|
||||
encoding=None,
|
||||
errors=None,
|
||||
newline=None,
|
||||
line_buffering=False,
|
||||
write_through=False,
|
||||
):
|
||||
self.args = buffer, encoding, errors, newline, line_buffering, write_through
|
||||
super().__init__(*self.args)
|
||||
|
||||
def __reduce__(self):
|
||||
return PickleableTextIOWrapper, self.args
|
||||
98
.venv/lib/python3.10/site-packages/fsspec/dircache.py
Normal file
@@ -0,0 +1,98 @@
import time
from collections.abc import MutableMapping
from functools import lru_cache


class DirCache(MutableMapping):
    """
    Caching of directory listings, in a structure like::

        {"path0": [
            {"name": "path0/file0",
             "size": 123,
             "type": "file",
             ...
            },
            {"name": "path0/file1",
            },
            ...
            ],
         "path1": [...]
        }

    Parameters to this class control listing expiry or indeed turn
    caching off
    """

    def __init__(
        self,
        use_listings_cache=True,
        listings_expiry_time=None,
        max_paths=None,
        **kwargs,
    ):
        """

        Parameters
        ----------
        use_listings_cache: bool
            If False, this cache never returns items, but always reports KeyError,
            and setting items has no effect
        listings_expiry_time: int or float (optional)
            Time in seconds that a listing is considered valid. If None,
            listings do not expire.
        max_paths: int (optional)
            The number of most recent listings that are considered valid; 'recent'
            refers to when the entry was set.
        """
        self._cache = {}
        self._times = {}
        if max_paths:
            self._q = lru_cache(max_paths + 1)(lambda key: self._cache.pop(key, None))
        self.use_listings_cache = use_listings_cache
        self.listings_expiry_time = listings_expiry_time
        self.max_paths = max_paths

    def __getitem__(self, item):
        if self.listings_expiry_time is not None:
            if self._times.get(item, 0) - time.time() < -self.listings_expiry_time:
                del self._cache[item]
        if self.max_paths:
            self._q(item)
        return self._cache[item]  # maybe raises KeyError

    def clear(self):
        self._cache.clear()

    def __len__(self):
        return len(self._cache)

    def __contains__(self, item):
        try:
            self[item]
            return True
        except KeyError:
            return False

    def __setitem__(self, key, value):
        if not self.use_listings_cache:
            return
        if self.max_paths:
            self._q(key)
        self._cache[key] = value
        if self.listings_expiry_time is not None:
            self._times[key] = time.time()

    def __delitem__(self, key):
        del self._cache[key]

    def __iter__(self):
        entries = list(self._cache)

        return (k for k in entries if k in self)

    def __reduce__(self):
        return (
            DirCache,
            (self.use_listings_cache, self.listings_expiry_time, self.max_paths),
        )
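A brief sketch of the expiry behaviour implemented above (listing values are illustrative, not part of the committed file):

import time

from fsspec.dircache import DirCache

cache = DirCache(listings_expiry_time=0.1)
cache["bucket/prefix"] = [
    {"name": "bucket/prefix/file0", "size": 123, "type": "file"},
]
print("bucket/prefix" in cache)  # True while the listing is fresh

time.sleep(0.2)
print("bucket/prefix" in cache)  # False: the entry expired and was evicted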
18
.venv/lib/python3.10/site-packages/fsspec/exceptions.py
Normal file
@@ -0,0 +1,18 @@
"""
fsspec user-defined exception classes
"""

import asyncio


class BlocksizeMismatchError(ValueError):
    """
    Raised when a cached file is opened with a different blocksize than it was
    written with
    """


class FSTimeoutError(asyncio.TimeoutError):
    """
    Raised when a fsspec function timed out occurs
    """
324
.venv/lib/python3.10/site-packages/fsspec/fuse.py
Normal file
@@ -0,0 +1,324 @@
|
||||
import argparse
|
||||
import logging
|
||||
import os
|
||||
import stat
|
||||
import threading
|
||||
import time
|
||||
from errno import EIO, ENOENT
|
||||
|
||||
from fuse import FUSE, FuseOSError, LoggingMixIn, Operations
|
||||
|
||||
from fsspec import __version__
|
||||
from fsspec.core import url_to_fs
|
||||
|
||||
logger = logging.getLogger("fsspec.fuse")
|
||||
|
||||
|
||||
class FUSEr(Operations):
|
||||
def __init__(self, fs, path, ready_file=False):
|
||||
self.fs = fs
|
||||
self.cache = {}
|
||||
self.root = path.rstrip("/") + "/"
|
||||
self.counter = 0
|
||||
logger.info("Starting FUSE at %s", path)
|
||||
self._ready_file = ready_file
|
||||
|
||||
def getattr(self, path, fh=None):
|
||||
logger.debug("getattr %s", path)
|
||||
if self._ready_file and path in ["/.fuse_ready", ".fuse_ready"]:
|
||||
return {"type": "file", "st_size": 5}
|
||||
|
||||
path = "".join([self.root, path.lstrip("/")]).rstrip("/")
|
||||
try:
|
||||
info = self.fs.info(path)
|
||||
except FileNotFoundError as exc:
|
||||
raise FuseOSError(ENOENT) from exc
|
||||
|
||||
data = {"st_uid": info.get("uid", 1000), "st_gid": info.get("gid", 1000)}
|
||||
perm = info.get("mode", 0o777)
|
||||
|
||||
if info["type"] != "file":
|
||||
data["st_mode"] = stat.S_IFDIR | perm
|
||||
data["st_size"] = 0
|
||||
data["st_blksize"] = 0
|
||||
else:
|
||||
data["st_mode"] = stat.S_IFREG | perm
|
||||
data["st_size"] = info["size"]
|
||||
data["st_blksize"] = 5 * 2**20
|
||||
data["st_nlink"] = 1
|
||||
data["st_atime"] = info["atime"] if "atime" in info else time.time()
|
||||
data["st_ctime"] = info["ctime"] if "ctime" in info else time.time()
|
||||
data["st_mtime"] = info["mtime"] if "mtime" in info else time.time()
|
||||
return data
|
||||
|
||||
def readdir(self, path, fh):
|
||||
logger.debug("readdir %s", path)
|
||||
path = "".join([self.root, path.lstrip("/")])
|
||||
files = self.fs.ls(path, False)
|
||||
files = [os.path.basename(f.rstrip("/")) for f in files]
|
||||
return [".", ".."] + files
|
||||
|
||||
def mkdir(self, path, mode):
|
||||
path = "".join([self.root, path.lstrip("/")])
|
||||
self.fs.mkdir(path)
|
||||
return 0
|
||||
|
||||
def rmdir(self, path):
|
||||
path = "".join([self.root, path.lstrip("/")])
|
||||
self.fs.rmdir(path)
|
||||
return 0
|
||||
|
||||
def read(self, path, size, offset, fh):
|
||||
logger.debug("read %s", (path, size, offset))
|
||||
if self._ready_file and path in ["/.fuse_ready", ".fuse_ready"]:
|
||||
# status indicator
|
||||
return b"ready"
|
||||
|
||||
f = self.cache[fh]
|
||||
f.seek(offset)
|
||||
out = f.read(size)
|
||||
return out
|
||||
|
||||
def write(self, path, data, offset, fh):
|
||||
logger.debug("write %s", (path, offset))
|
||||
f = self.cache[fh]
|
||||
f.seek(offset)
|
||||
f.write(data)
|
||||
return len(data)
|
||||
|
||||
def create(self, path, flags, fi=None):
|
||||
logger.debug("create %s", (path, flags))
|
||||
fn = "".join([self.root, path.lstrip("/")])
|
||||
self.fs.touch(fn) # OS will want to get attributes immediately
|
||||
f = self.fs.open(fn, "wb")
|
||||
self.cache[self.counter] = f
|
||||
self.counter += 1
|
||||
return self.counter - 1
|
||||
|
||||
def open(self, path, flags):
|
||||
logger.debug("open %s", (path, flags))
|
||||
fn = "".join([self.root, path.lstrip("/")])
|
||||
if flags % 2 == 0:
|
||||
# read
|
||||
mode = "rb"
|
||||
else:
|
||||
# write/create
|
||||
mode = "wb"
|
||||
self.cache[self.counter] = self.fs.open(fn, mode)
|
||||
self.counter += 1
|
||||
return self.counter - 1
|
||||
|
||||
def truncate(self, path, length, fh=None):
|
||||
fn = "".join([self.root, path.lstrip("/")])
|
||||
if length != 0:
|
||||
raise NotImplementedError
|
||||
# maybe should be no-op since open with write sets size to zero anyway
|
||||
self.fs.touch(fn)
|
||||
|
||||
def unlink(self, path):
|
||||
fn = "".join([self.root, path.lstrip("/")])
|
||||
try:
|
||||
self.fs.rm(fn, False)
|
||||
except (OSError, FileNotFoundError) as exc:
|
||||
raise FuseOSError(EIO) from exc
|
||||
|
||||
def release(self, path, fh):
|
||||
try:
|
||||
if fh in self.cache:
|
||||
f = self.cache[fh]
|
||||
f.close()
|
||||
self.cache.pop(fh)
|
||||
except Exception as e:
|
||||
print(e)
|
||||
return 0
|
||||
|
||||
def chmod(self, path, mode):
|
||||
if hasattr(self.fs, "chmod"):
|
||||
path = "".join([self.root, path.lstrip("/")])
|
||||
return self.fs.chmod(path, mode)
|
||||
raise NotImplementedError
|
||||
|
||||
|
||||
def run(
|
||||
fs,
|
||||
path,
|
||||
mount_point,
|
||||
foreground=True,
|
||||
threads=False,
|
||||
ready_file=False,
|
||||
ops_class=FUSEr,
|
||||
):
|
||||
"""Mount stuff in a local directory
|
||||
|
||||
This uses fusepy to make it appear as if a given path on an fsspec
|
||||
instance is in fact resident within the local file-system.
|
||||
|
||||
This requires that fusepy be installed, and that FUSE be available on
|
||||
the system (typically requiring a package to be installed with
|
||||
apt, yum, brew, etc.).
|
||||
|
||||
Parameters
|
||||
----------
|
||||
fs: file-system instance
|
||||
From one of the compatible implementations
|
||||
path: str
|
||||
Location on that file-system to regard as the root directory to
|
||||
mount. Note that you typically should include the terminating "/"
|
||||
character.
|
||||
mount_point: str
|
||||
An empty directory on the local file-system where the contents of
|
||||
the remote path will appear.
|
||||
foreground: bool
|
||||
Whether or not calling this function will block. Operation will
|
||||
typically be more stable if True.
|
||||
threads: bool
|
||||
Whether or not to create threads when responding to file operations
|
||||
within the mounted directory. Operation will typically be more
|
||||
stable if False.
|
||||
ready_file: bool
|
||||
If True, the ``.fuse_ready`` file will be created in the
|
||||
``mount_point`` directory once the FUSE process is ready (for debugging).
|
||||
ops_class: FUSEr or Subclass of FUSEr
|
||||
To override the default behavior of FUSEr. For example, logging
|
||||
to file.
|
||||
|
||||
"""
|
||||
func = lambda: FUSE(
|
||||
ops_class(fs, path, ready_file=ready_file),
|
||||
mount_point,
|
||||
nothreads=not threads,
|
||||
foreground=foreground,
|
||||
)
|
||||
if not foreground:
|
||||
th = threading.Thread(target=func)
|
||||
th.daemon = True
|
||||
th.start()
|
||||
return th
|
||||
else: # pragma: no cover
|
||||
try:
|
||||
func()
|
||||
except KeyboardInterrupt:
|
||||
pass
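# Illustrative usage sketch (assumes the ``run`` function above; requires
# fusepy and a system FUSE installation; "/tmp/mem" is a hypothetical empty
# directory used as the mount point).
import fsspec

mem = fsspec.filesystem("memory")
mem.makedirs("/demo", exist_ok=True)
mem.pipe("/demo/hello.txt", b"hello world")
# foreground=False starts the FUSE loop on a daemon thread and returns it,
# so the mounted files appear under /tmp/mem while the process lives.
th = run(mem, "/demo/", "/tmp/mem", foreground=False)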
|
||||
|
||||
|
||||
def main(args):
|
||||
"""Mount filesystem from chained URL to MOUNT_POINT.
|
||||
|
||||
Examples:
|
||||
|
||||
python3 -m fsspec.fuse memory /usr/share /tmp/mem
|
||||
|
||||
python3 -m fsspec.fuse local /tmp/source /tmp/local \\
|
||||
-l /tmp/fsspecfuse.log
|
||||
|
||||
You can also mount chained-URLs and use special settings:
|
||||
|
||||
python3 -m fsspec.fuse 'filecache::zip::file://data.zip' \\
|
||||
/ /tmp/zip \\
|
||||
-o 'filecache-cache_storage=/tmp/simplecache'
|
||||
|
||||
You can specify the type of the setting by using `[int]` or `[bool]`,
|
||||
(`true`, `yes`, and `1` represent the Boolean value `True`):
|
||||
|
||||
python3 -m fsspec.fuse 'simplecache::ftp://ftp1.at.proftpd.org' \\
|
||||
/historic/packages/RPMS /tmp/ftp \\
|
||||
-o 'simplecache-cache_storage=/tmp/simplecache' \\
|
||||
-o 'simplecache-check_files=false[bool]' \\
|
||||
-o 'ftp-listings_expiry_time=60[int]' \\
|
||||
-o 'ftp-username=anonymous' \\
|
||||
-o 'ftp-password=xieyanbo'
|
||||
"""
|
||||
|
||||
class RawDescriptionArgumentParser(argparse.ArgumentParser):
|
||||
def format_help(self):
|
||||
usage = super().format_help()
|
||||
parts = usage.split("\n\n")
|
||||
parts[1] = self.description.rstrip()
|
||||
return "\n\n".join(parts)
|
||||
|
||||
parser = RawDescriptionArgumentParser(prog="fsspec.fuse", description=main.__doc__)
|
||||
parser.add_argument("--version", action="version", version=__version__)
|
||||
parser.add_argument("url", type=str, help="fs url")
|
||||
parser.add_argument("source_path", type=str, help="source directory in fs")
|
||||
parser.add_argument("mount_point", type=str, help="local directory")
|
||||
parser.add_argument(
|
||||
"-o",
|
||||
"--option",
|
||||
action="append",
|
||||
help="Any options of protocol included in the chained URL",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-l", "--log-file", type=str, help="Logging FUSE debug info (Default: '')"
|
||||
)
|
||||
parser.add_argument(
|
||||
"-f",
|
||||
"--foreground",
|
||||
action="store_false",
|
||||
help="Running in foreground or not (Default: False)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-t",
|
||||
"--threads",
|
||||
action="store_false",
|
||||
help="Running with threads support (Default: False)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-r",
|
||||
"--ready-file",
|
||||
action="store_false",
|
||||
help="The `.fuse_ready` file will exist after FUSE is ready. "
|
||||
"(Debugging purpose, Default: False)",
|
||||
)
|
||||
args = parser.parse_args(args)
|
||||
|
||||
kwargs = {}
|
||||
for item in args.option or []:
|
||||
key, sep, value = item.partition("=")
|
||||
if not sep:
|
||||
parser.error(message=f"Wrong option: {item!r}")
|
||||
val = value.lower()
|
||||
if val.endswith("[int]"):
|
||||
value = int(value[: -len("[int]")])
|
||||
elif val.endswith("[bool]"):
|
||||
value = val[: -len("[bool]")] in ["1", "yes", "true"]
|
||||
|
||||
if "-" in key:
|
||||
fs_name, setting_name = key.split("-", 1)
|
||||
if fs_name in kwargs:
|
||||
kwargs[fs_name][setting_name] = value
|
||||
else:
|
||||
kwargs[fs_name] = {setting_name: value}
|
||||
else:
|
||||
kwargs[key] = value
|
||||
|
||||
if args.log_file:
|
||||
logging.basicConfig(
|
||||
level=logging.DEBUG,
|
||||
filename=args.log_file,
|
||||
format="%(asctime)s %(message)s",
|
||||
)
|
||||
|
||||
class LoggingFUSEr(FUSEr, LoggingMixIn):
|
||||
pass
|
||||
|
||||
fuser = LoggingFUSEr
|
||||
else:
|
||||
fuser = FUSEr
|
||||
|
||||
fs, url_path = url_to_fs(args.url, **kwargs)
|
||||
logger.debug("Mounting %s to %s", url_path, str(args.mount_point))
|
||||
run(
|
||||
fs,
|
||||
args.source_path,
|
||||
args.mount_point,
|
||||
foreground=args.foreground,
|
||||
threads=args.threads,
|
||||
ready_file=args.ready_file,
|
||||
ops_class=fuser,
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import sys
|
||||
|
||||
main(sys.argv[1:])
|
||||
411
.venv/lib/python3.10/site-packages/fsspec/generic.py
Normal file
@@ -0,0 +1,411 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import inspect
|
||||
import logging
|
||||
import os
|
||||
import shutil
|
||||
import uuid
|
||||
from typing import Optional
|
||||
|
||||
from .asyn import AsyncFileSystem, _run_coros_in_chunks, sync_wrapper
|
||||
from .callbacks import DEFAULT_CALLBACK
|
||||
from .core import filesystem, get_filesystem_class, split_protocol, url_to_fs
|
||||
|
||||
_generic_fs = {}
|
||||
logger = logging.getLogger("fsspec.generic")
|
||||
|
||||
|
||||
def set_generic_fs(protocol, **storage_options):
|
||||
_generic_fs[protocol] = filesystem(protocol, **storage_options)
|
||||
|
||||
|
||||
default_method = "default"
|
||||
|
||||
|
||||
def _resolve_fs(url, method=None, protocol=None, storage_options=None):
|
||||
"""Pick instance of backend FS"""
|
||||
method = method or default_method
|
||||
protocol = protocol or split_protocol(url)[0]
|
||||
storage_options = storage_options or {}
|
||||
if method == "default":
|
||||
return filesystem(protocol)
|
||||
if method == "generic":
|
||||
return _generic_fs[protocol]
|
||||
if method == "current":
|
||||
cls = get_filesystem_class(protocol)
|
||||
return cls.current()
|
||||
if method == "options":
|
||||
fs, _ = url_to_fs(url, **storage_options.get(protocol, {}))
|
||||
return fs
|
||||
raise ValueError(f"Unknown FS resolution method: {method}")
|
||||
|
||||
|
||||
def rsync(
|
||||
source,
|
||||
destination,
|
||||
delete_missing=False,
|
||||
source_field="size",
|
||||
dest_field="size",
|
||||
update_cond="different",
|
||||
inst_kwargs=None,
|
||||
fs=None,
|
||||
**kwargs,
|
||||
):
|
||||
"""Sync files between two directory trees
|
||||
|
||||
(experimental)
|
||||
|
||||
Parameters
|
||||
----------
|
||||
source: str
|
||||
Root of the directory tree to take files from. This must be a directory, but
|
||||
do not include any terminating "/" character
|
||||
destination: str
|
||||
Root path to copy into. The contents of this location should be
|
||||
identical to the contents of ``source`` when done. This will be made a
|
||||
directory, and the terminal "/" should not be included.
|
||||
delete_missing: bool
|
||||
If there are paths in the destination that don't exist in the
|
||||
source and this is True, delete them. Otherwise, leave them alone.
|
||||
source_field: str | callable
|
||||
If ``update_cond`` is "different", this is the key in the info
|
||||
of source files to consider for difference. May be a function of the
|
||||
info dict.
|
||||
dest_field: str | callable
|
||||
If ``update_cond`` is "different", this is the key in the info
|
||||
of destination files to consider for difference. May be a function of
|
||||
the info dict.
|
||||
update_cond: "different"|"always"|"never"
|
||||
If "always", every file is copied, regardless of whether it exists in
|
||||
the destination. If "never", files that exist in the destination are
|
||||
not copied again. If "different" (default), only copy if the info
|
||||
fields given by ``source_field`` and ``dest_field`` (usually "size")
|
||||
are different. Other comparisons may be added in the future.
|
||||
inst_kwargs: dict|None
|
||||
If ``fs`` is None, use this set of keyword arguments to make a
|
||||
GenericFileSystem instance
|
||||
fs: GenericFileSystem|None
|
||||
Instance to use if explicitly given. The instance defines how to
|
||||
make downstream file system instances from paths.
|
||||
|
||||
Returns
|
||||
-------
|
||||
dict of the copy operations that were performed, {source: destination}
|
||||
"""
|
||||
fs = fs or GenericFileSystem(**(inst_kwargs or {}))
|
||||
source = fs._strip_protocol(source)
|
||||
destination = fs._strip_protocol(destination)
|
||||
allfiles = fs.find(source, withdirs=True, detail=True)
|
||||
if not fs.isdir(source):
|
||||
raise ValueError("Can only rsync on a directory")
|
||||
otherfiles = fs.find(destination, withdirs=True, detail=True)
|
||||
dirs = [
|
||||
a
|
||||
for a, v in allfiles.items()
|
||||
if v["type"] == "directory" and a.replace(source, destination) not in otherfiles
|
||||
]
|
||||
logger.debug(f"{len(dirs)} directories to create")
|
||||
if dirs:
|
||||
fs.make_many_dirs(
|
||||
[dirn.replace(source, destination) for dirn in dirs], exist_ok=True
|
||||
)
|
||||
allfiles = {a: v for a, v in allfiles.items() if v["type"] == "file"}
|
||||
logger.debug(f"{len(allfiles)} files to consider for copy")
|
||||
to_delete = [
|
||||
o
|
||||
for o, v in otherfiles.items()
|
||||
if o.replace(destination, source) not in allfiles and v["type"] == "file"
|
||||
]
|
||||
for k, v in allfiles.copy().items():
|
||||
otherfile = k.replace(source, destination)
|
||||
if otherfile in otherfiles:
|
||||
if update_cond == "always":
|
||||
allfiles[k] = otherfile
|
||||
elif update_cond == "different":
|
||||
inf1 = source_field(v) if callable(source_field) else v[source_field]
|
||||
v2 = otherfiles[otherfile]
|
||||
inf2 = dest_field(v2) if callable(dest_field) else v2[dest_field]
|
||||
if inf1 != inf2:
|
||||
# details mismatch, make copy
|
||||
allfiles[k] = otherfile
|
||||
else:
|
||||
# details match, don't copy
|
||||
allfiles.pop(k)
|
||||
else:
|
||||
# file not in target yet
|
||||
allfiles[k] = otherfile
|
||||
logger.debug(f"{len(allfiles)} files to copy")
|
||||
if allfiles:
|
||||
source_files, target_files = zip(*allfiles.items())
|
||||
fs.cp(source_files, target_files, **kwargs)
|
||||
logger.debug(f"{len(to_delete)} files to delete")
|
||||
if delete_missing and to_delete:
|
||||
fs.rm(to_delete)
|
||||
return allfiles
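# Illustrative usage sketch (assumes the ``rsync`` function above; the two
# directory URLs are hypothetical). With these settings, files are copied only
# when their "size" fields differ, and files missing from the source are
# deleted from the destination.
copied = rsync(
    "file:///tmp/source",
    "memory://backup",
    delete_missing=True,
    update_cond="different",
)
print(f"{len(copied)} files copied")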
|
||||
|
||||
|
||||
class GenericFileSystem(AsyncFileSystem):
|
||||
"""Wrapper over all other FS types
|
||||
|
||||
<experimental!>
|
||||
|
||||
This implementation is a single unified interface to be able to run FS operations
|
||||
over generic URLs, and dispatch to the specific implementations using the URL
|
||||
protocol prefix.
|
||||
|
||||
Note: instances of this FS are always async, even if you never use it with any async
|
||||
backend.
|
||||
"""
|
||||
|
||||
protocol = "generic" # there is no real reason to ever use a protocol with this FS
|
||||
|
||||
def __init__(self, default_method="default", **kwargs):
|
||||
"""
|
||||
|
||||
Parameters
|
||||
----------
|
||||
default_method: str (optional)
|
||||
Defines how to configure backend FS instances. Options are:
|
||||
- "default": instantiate like FSClass(), with no
|
||||
extra arguments; this is the default instance of that FS, and can be
|
||||
configured via the config system
|
||||
- "generic": takes instances from the `_generic_fs` dict in this module,
|
||||
which you must populate before use. Keys are by protocol
|
||||
- "current": takes the most recently instantiated version of each FS
|
||||
"""
|
||||
self.method = default_method
|
||||
super().__init__(**kwargs)
|
||||
|
||||
def _parent(self, path):
|
||||
fs = _resolve_fs(path, self.method)
|
||||
return fs.unstrip_protocol(fs._parent(path))
|
||||
|
||||
def _strip_protocol(self, path):
|
||||
# normalization only
|
||||
fs = _resolve_fs(path, self.method)
|
||||
return fs.unstrip_protocol(fs._strip_protocol(path))
|
||||
|
||||
async def _find(self, path, maxdepth=None, withdirs=False, detail=False, **kwargs):
|
||||
fs = _resolve_fs(path, self.method)
|
||||
if fs.async_impl:
|
||||
out = await fs._find(
|
||||
path, maxdepth=maxdepth, withdirs=withdirs, detail=True, **kwargs
|
||||
)
|
||||
else:
|
||||
out = fs.find(
|
||||
path, maxdepth=maxdepth, withdirs=withdirs, detail=True, **kwargs
|
||||
)
|
||||
result = {}
|
||||
for k, v in out.items():
|
||||
v = v.copy() # don't corrupt target FS dircache
|
||||
name = fs.unstrip_protocol(k)
|
||||
v["name"] = name
|
||||
result[name] = v
|
||||
if detail:
|
||||
return result
|
||||
return list(result)
|
||||
|
||||
async def _info(self, url, **kwargs):
|
||||
fs = _resolve_fs(url, self.method)
|
||||
if fs.async_impl:
|
||||
out = await fs._info(url, **kwargs)
|
||||
else:
|
||||
out = fs.info(url, **kwargs)
|
||||
out = out.copy() # don't edit originals
|
||||
out["name"] = fs.unstrip_protocol(out["name"])
|
||||
return out
|
||||
|
||||
async def _ls(
|
||||
self,
|
||||
url,
|
||||
detail=True,
|
||||
**kwargs,
|
||||
):
|
||||
fs = _resolve_fs(url, self.method)
|
||||
if fs.async_impl:
|
||||
out = await fs._ls(url, detail=True, **kwargs)
|
||||
else:
|
||||
out = fs.ls(url, detail=True, **kwargs)
|
||||
out = [o.copy() for o in out] # don't edit originals
|
||||
for o in out:
|
||||
o["name"] = fs.unstrip_protocol(o["name"])
|
||||
if detail:
|
||||
return out
|
||||
else:
|
||||
return [o["name"] for o in out]
|
||||
|
||||
async def _cat_file(
|
||||
self,
|
||||
url,
|
||||
**kwargs,
|
||||
):
|
||||
fs = _resolve_fs(url, self.method)
|
||||
if fs.async_impl:
|
||||
return await fs._cat_file(url, **kwargs)
|
||||
else:
|
||||
return fs.cat_file(url, **kwargs)
|
||||
|
||||
async def _pipe_file(
|
||||
self,
|
||||
path,
|
||||
value,
|
||||
**kwargs,
|
||||
):
|
||||
fs = _resolve_fs(path, self.method)
|
||||
if fs.async_impl:
|
||||
return await fs._pipe_file(path, value, **kwargs)
|
||||
else:
|
||||
return fs.pipe_file(path, value, **kwargs)
|
||||
|
||||
async def _rm(self, url, **kwargs):
|
||||
urls = url
|
||||
if isinstance(urls, str):
|
||||
urls = [urls]
|
||||
fs = _resolve_fs(urls[0], self.method)
|
||||
if fs.async_impl:
|
||||
await fs._rm(urls, **kwargs)
|
||||
else:
|
||||
fs.rm(url, **kwargs)
|
||||
|
||||
async def _makedirs(self, path, exist_ok=False):
|
||||
logger.debug("Make dir %s", path)
|
||||
fs = _resolve_fs(path, self.method)
|
||||
if fs.async_impl:
|
||||
await fs._makedirs(path, exist_ok=exist_ok)
|
||||
else:
|
||||
fs.makedirs(path, exist_ok=exist_ok)
|
||||
|
||||
def rsync(self, source, destination, **kwargs):
|
||||
"""Sync files between two directory trees
|
||||
|
||||
See :func:`rsync` for more details.
|
||||
"""
|
||||
rsync(source, destination, fs=self, **kwargs)
|
||||
|
||||
async def _cp_file(
|
||||
self,
|
||||
url,
|
||||
url2,
|
||||
blocksize=2**20,
|
||||
callback=DEFAULT_CALLBACK,
|
||||
**kwargs,
|
||||
):
|
||||
fs = _resolve_fs(url, self.method)
|
||||
fs2 = _resolve_fs(url2, self.method)
|
||||
if fs is fs2:
|
||||
# pure remote
|
||||
if fs.async_impl:
|
||||
return await fs._cp_file(url, url2, **kwargs)
|
||||
else:
|
||||
return fs.cp_file(url, url2, **kwargs)
|
||||
kw = {"blocksize": 0, "cache_type": "none"}
|
||||
try:
|
||||
f1 = (
|
||||
await fs.open_async(url, "rb")
|
||||
if hasattr(fs, "open_async")
|
||||
else fs.open(url, "rb", **kw)
|
||||
)
|
||||
callback.set_size(await maybe_await(f1.size))
|
||||
f2 = (
|
||||
await fs2.open_async(url2, "wb")
|
||||
if hasattr(fs2, "open_async")
|
||||
else fs2.open(url2, "wb", **kw)
|
||||
)
|
||||
while f1.size is None or f2.tell() < f1.size:
|
||||
data = await maybe_await(f1.read(blocksize))
|
||||
if f1.size is None and not data:
|
||||
break
|
||||
await maybe_await(f2.write(data))
|
||||
callback.absolute_update(f2.tell())
|
||||
finally:
|
||||
try:
|
||||
await maybe_await(f2.close())
|
||||
await maybe_await(f1.close())
|
||||
except NameError:
|
||||
# fail while opening f1 or f2
|
||||
pass
|
||||
|
||||
async def _make_many_dirs(self, urls, exist_ok=True):
|
||||
fs = _resolve_fs(urls[0], self.method)
|
||||
if fs.async_impl:
|
||||
coros = [fs._makedirs(u, exist_ok=exist_ok) for u in urls]
|
||||
await _run_coros_in_chunks(coros)
|
||||
else:
|
||||
for u in urls:
|
||||
fs.makedirs(u, exist_ok=exist_ok)
|
||||
|
||||
make_many_dirs = sync_wrapper(_make_many_dirs)
|
||||
|
||||
async def _copy(
|
||||
self,
|
||||
path1: list[str],
|
||||
path2: list[str],
|
||||
recursive: bool = False,
|
||||
on_error: str = "ignore",
|
||||
maxdepth: Optional[int] = None,
|
||||
batch_size: Optional[int] = None,
|
||||
tempdir: Optional[str] = None,
|
||||
**kwargs,
|
||||
):
|
||||
if recursive:
|
||||
raise NotImplementedError
|
||||
fs = _resolve_fs(path1[0], self.method)
|
||||
fs2 = _resolve_fs(path2[0], self.method)
|
||||
# not expanding paths atm., assume call is from rsync()
|
||||
if fs is fs2:
|
||||
# pure remote
|
||||
if fs.async_impl:
|
||||
return await fs._copy(path1, path2, **kwargs)
|
||||
else:
|
||||
return fs.copy(path1, path2, **kwargs)
|
||||
await copy_file_op(
|
||||
fs, path1, fs2, path2, tempdir, batch_size, on_error=on_error
|
||||
)
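# Illustrative usage sketch (the "memory://" URLs are hypothetical): a single
# "generic" instance dispatches each call to the backend named by the URL
# protocol, using the sync wrappers generated by AsyncFileSystem.
import fsspec

gfs = fsspec.filesystem("generic", default_method="default")
gfs.pipe_file("memory://demo.txt", b"data")   # handled by the memory backend
print(gfs.cat_file("memory://demo.txt"))      # b"data"
print(gfs.ls("memory://", detail=False))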
|
||||
|
||||
|
||||
async def copy_file_op(
|
||||
fs1, url1, fs2, url2, tempdir=None, batch_size=20, on_error="ignore"
|
||||
):
|
||||
import tempfile
|
||||
|
||||
tempdir = tempdir or tempfile.mkdtemp()
|
||||
try:
|
||||
coros = [
|
||||
_copy_file_op(
|
||||
fs1,
|
||||
u1,
|
||||
fs2,
|
||||
u2,
|
||||
os.path.join(tempdir, uuid.uuid4().hex),
|
||||
on_error=on_error,
|
||||
)
|
||||
for u1, u2 in zip(url1, url2)
|
||||
]
|
||||
await _run_coros_in_chunks(coros, batch_size=batch_size)
|
||||
finally:
|
||||
shutil.rmtree(tempdir)
|
||||
|
||||
|
||||
async def _copy_file_op(fs1, url1, fs2, url2, local, on_error="ignore"):
|
||||
ex = () if on_error == "raise" else Exception
|
||||
logger.debug("Copy %s -> %s", url1, url2)
|
||||
try:
|
||||
if fs1.async_impl:
|
||||
await fs1._get_file(url1, local)
|
||||
else:
|
||||
fs1.get_file(url1, local)
|
||||
if fs2.async_impl:
|
||||
await fs2._put_file(local, url2)
|
||||
else:
|
||||
fs2.put_file(local, url2)
|
||||
os.unlink(local)
|
||||
logger.debug("Copy %s -> %s; done", url1, url2)
|
||||
except ex as e:
|
||||
logger.debug("ignoring cp exception for %s: %s", url1, e)
|
||||
|
||||
|
||||
async def maybe_await(cor):
|
||||
if inspect.iscoroutine(cor):
|
||||
return await cor
|
||||
else:
|
||||
return cor
|
||||
416
.venv/lib/python3.10/site-packages/fsspec/gui.py
Normal file
@@ -0,0 +1,416 @@
|
||||
import ast
|
||||
import contextlib
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
from typing import ClassVar, Sequence
|
||||
|
||||
import panel as pn
|
||||
|
||||
from .core import OpenFile, get_filesystem_class, split_protocol
|
||||
from .registry import known_implementations
|
||||
|
||||
pn.extension()
|
||||
logger = logging.getLogger("fsspec.gui")
|
||||
|
||||
|
||||
class SigSlot:
|
||||
"""Signal-slot mixin, for Panel event passing
|
||||
|
||||
Include this class in a widget manager's superclasses to be able to
|
||||
register events and callbacks on Panel widgets managed by that class.
|
||||
|
||||
The method ``_register`` should be called as widgets are added, and external
|
||||
code should call ``connect`` to associate callbacks.
|
||||
|
||||
By default, all signals emit a DEBUG logging statement.
|
||||
"""
|
||||
|
||||
# names of signals that this class may emit each of which must be
|
||||
# set by _register for any new instance
|
||||
signals: ClassVar[Sequence[str]] = []
|
||||
# names of actions that this class may respond to
|
||||
slots: ClassVar[Sequence[str]] = []
|
||||
|
||||
# each of which must be a method name
|
||||
|
||||
def __init__(self):
|
||||
self._ignoring_events = False
|
||||
self._sigs = {}
|
||||
self._map = {}
|
||||
self._setup()
|
||||
|
||||
def _setup(self):
|
||||
"""Create GUI elements and register signals"""
|
||||
self.panel = pn.pane.PaneBase()
|
||||
# no signals to set up in the base class
|
||||
|
||||
def _register(
|
||||
self, widget, name, thing="value", log_level=logging.DEBUG, auto=False
|
||||
):
|
||||
"""Watch the given attribute of a widget and assign it a named event
|
||||
|
||||
This is normally called at the time a widget is instantiated, in the
|
||||
class which owns it.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
widget : pn.layout.Panel or None
|
||||
Widget to watch. If None, an anonymous signal not associated with
|
||||
any widget.
|
||||
name : str
|
||||
Name of this event
|
||||
thing : str
|
||||
Attribute of the given widget to watch
|
||||
log_level : int
|
||||
When the signal is triggered, a logging event of the given level
|
||||
will be fired in the ``fsspec.gui`` logger.
|
||||
auto : bool
|
||||
If True, automatically connects with a method in this class of the
|
||||
same name.
|
||||
"""
|
||||
if name not in self.signals:
|
||||
raise ValueError(f"Attempt to assign an undeclared signal: {name}")
|
||||
self._sigs[name] = {
|
||||
"widget": widget,
|
||||
"callbacks": [],
|
||||
"thing": thing,
|
||||
"log": log_level,
|
||||
}
|
||||
wn = "-".join(
|
||||
[
|
||||
getattr(widget, "name", str(widget)) if widget is not None else "none",
|
||||
thing,
|
||||
]
|
||||
)
|
||||
self._map[wn] = name
|
||||
if widget is not None:
|
||||
widget.param.watch(self._signal, thing, onlychanged=True)
|
||||
if auto and hasattr(self, name):
|
||||
self.connect(name, getattr(self, name))
|
||||
|
||||
def _repr_mimebundle_(self, *args, **kwargs):
|
||||
"""Display in a notebook or a server"""
|
||||
try:
|
||||
return self.panel._repr_mimebundle_(*args, **kwargs)
|
||||
except (ValueError, AttributeError) as exc:
|
||||
raise NotImplementedError(
|
||||
"Panel does not seem to be set up properly"
|
||||
) from exc
|
||||
|
||||
def connect(self, signal, slot):
|
||||
"""Associate call back with given event
|
||||
|
||||
The callback must be a function which takes the "new" value of the
|
||||
watched attribute as the only parameter. If the callback returns False,
|
||||
this cancels any further processing of the given event.
|
||||
|
||||
Alternatively, the callback can be a string, in which case it means
|
||||
emitting the correspondingly-named event (i.e., connect to self)
|
||||
"""
|
||||
self._sigs[signal]["callbacks"].append(slot)
|
||||
|
||||
def _signal(self, event):
|
||||
"""This is called by a an action on a widget
|
||||
|
||||
Within a self.ignore_events context, nothing happens.
|
||||
|
||||
Tests can execute this method by directly changing the values of
|
||||
widget components.
|
||||
"""
|
||||
if not self._ignoring_events:
|
||||
wn = "-".join([event.obj.name, event.name])
|
||||
if wn in self._map and self._map[wn] in self._sigs:
|
||||
self._emit(self._map[wn], event.new)
|
||||
|
||||
@contextlib.contextmanager
|
||||
def ignore_events(self):
|
||||
"""Temporarily turn off events processing in this instance
|
||||
|
||||
(does not propagate to children)
|
||||
"""
|
||||
self._ignoring_events = True
|
||||
try:
|
||||
yield
|
||||
finally:
|
||||
self._ignoring_events = False
|
||||
|
||||
def _emit(self, sig, value=None):
|
||||
"""An event happened, call its callbacks
|
||||
|
||||
This method can be used in tests to simulate message passing without
|
||||
directly changing visual elements.
|
||||
|
||||
Calling of callbacks will halt whenever one returns False.
|
||||
"""
|
||||
logger.log(self._sigs[sig]["log"], f"{sig}: {value}")
|
||||
for callback in self._sigs[sig]["callbacks"]:
|
||||
if isinstance(callback, str):
|
||||
self._emit(callback)
|
||||
else:
|
||||
try:
|
||||
# running callbacks should not break the interface
|
||||
ret = callback(value)
|
||||
if ret is False:
|
||||
break
|
||||
except Exception as e:
|
||||
logger.exception(
|
||||
"Exception (%s) while executing callback for signal: %s",
|
||||
e,
|
||||
sig,
|
||||
)
|
||||
|
||||
def show(self, threads=False):
|
||||
"""Open a new browser tab and display this instance's interface"""
|
||||
self.panel.show(threads=threads, verbose=False)
|
||||
return self
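# Illustrative usage sketch (assumes panel is installed; the widget and signal
# names are hypothetical): a minimal SigSlot subclass that declares one signal,
# registers a button's "clicks" attribute for it, and connects a callback.
class HelloButton(SigSlot):
    signals = ["clicked"]

    def _setup(self):
        self.button = pn.widgets.Button(name="hello")
        self.panel = pn.Row(self.button)
        self._register(self.button, "clicked", "clicks")

hb = HelloButton()
hb.connect("clicked", lambda value: print("clicked", value))
hb._emit("clicked", 1)  # simulate the event directly, as tests would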
|
||||
|
||||
|
||||
class SingleSelect(SigSlot):
|
||||
"""A multiselect which only allows you to select one item for an event"""
|
||||
|
||||
signals = ["_selected", "selected"] # the first is internal
|
||||
slots = ["set_options", "set_selection", "add", "clear", "select"]
|
||||
|
||||
def __init__(self, **kwargs):
|
||||
self.kwargs = kwargs
|
||||
super().__init__()
|
||||
|
||||
def _setup(self):
|
||||
self.panel = pn.widgets.MultiSelect(**self.kwargs)
|
||||
self._register(self.panel, "_selected", "value")
|
||||
self._register(None, "selected")
|
||||
self.connect("_selected", self.select_one)
|
||||
|
||||
def _signal(self, *args, **kwargs):
|
||||
super()._signal(*args, **kwargs)
|
||||
|
||||
def select_one(self, *_):
|
||||
with self.ignore_events():
|
||||
val = [self.panel.value[-1]] if self.panel.value else []
|
||||
self.panel.value = val
|
||||
self._emit("selected", self.panel.value)
|
||||
|
||||
def set_options(self, options):
|
||||
self.panel.options = options
|
||||
|
||||
def clear(self):
|
||||
self.panel.options = []
|
||||
|
||||
@property
|
||||
def value(self):
|
||||
return self.panel.value
|
||||
|
||||
def set_selection(self, selection):
|
||||
self.panel.value = [selection]
|
||||
|
||||
|
||||
class FileSelector(SigSlot):
|
||||
"""Panel-based graphical file selector widget
|
||||
|
||||
Instances of this widget are interactive and can be displayed in jupyter by having
|
||||
them as the output of a cell, or in a separate browser tab using ``.show()``.
|
||||
"""
|
||||
|
||||
signals = [
|
||||
"protocol_changed",
|
||||
"selection_changed",
|
||||
"directory_entered",
|
||||
"home_clicked",
|
||||
"up_clicked",
|
||||
"go_clicked",
|
||||
"filters_changed",
|
||||
]
|
||||
slots = ["set_filters", "go_home"]
|
||||
|
||||
def __init__(self, url=None, filters=None, ignore=None, kwargs=None):
|
||||
"""
|
||||
|
||||
Parameters
|
||||
----------
|
||||
url : str (optional)
|
||||
Initial value of the URL to populate the dialog; should include protocol
|
||||
filters : list(str) (optional)
|
||||
File endings to include in the listings. If not included, all files are
|
||||
allowed. Does not affect directories.
|
||||
If given, the endings will appear as checkboxes in the interface
|
||||
ignore : list(str) (optional)
|
||||
Regex(s) of file basename patterns to ignore, e.g., "\\." for typical
|
||||
hidden files on posix
|
||||
kwargs : dict (optional)
|
||||
To pass to file system instance
|
||||
"""
|
||||
if url:
|
||||
self.init_protocol, url = split_protocol(url)
|
||||
else:
|
||||
self.init_protocol, url = "file", os.getcwd()
|
||||
self.init_url = url
|
||||
self.init_kwargs = (kwargs if isinstance(kwargs, str) else str(kwargs)) or "{}"
|
||||
self.filters = filters
|
||||
self.ignore = [re.compile(i) for i in ignore or []]
|
||||
self._fs = None
|
||||
super().__init__()
|
||||
|
||||
def _setup(self):
|
||||
self.url = pn.widgets.TextInput(
|
||||
name="url",
|
||||
value=self.init_url,
|
||||
align="end",
|
||||
sizing_mode="stretch_width",
|
||||
width_policy="max",
|
||||
)
|
||||
self.protocol = pn.widgets.Select(
|
||||
options=sorted(known_implementations),
|
||||
value=self.init_protocol,
|
||||
name="protocol",
|
||||
align="center",
|
||||
)
|
||||
self.kwargs = pn.widgets.TextInput(
|
||||
name="kwargs", value=self.init_kwargs, align="center"
|
||||
)
|
||||
self.go = pn.widgets.Button(name="⇨", align="end", width=45)
|
||||
self.main = SingleSelect(size=10)
|
||||
self.home = pn.widgets.Button(name="🏠", width=40, height=30, align="end")
|
||||
self.up = pn.widgets.Button(name="‹", width=30, height=30, align="end")
|
||||
|
||||
self._register(self.protocol, "protocol_changed", auto=True)
|
||||
self._register(self.go, "go_clicked", "clicks", auto=True)
|
||||
self._register(self.up, "up_clicked", "clicks", auto=True)
|
||||
self._register(self.home, "home_clicked", "clicks", auto=True)
|
||||
self._register(None, "selection_changed")
|
||||
self.main.connect("selected", self.selection_changed)
|
||||
self._register(None, "directory_entered")
|
||||
self.prev_protocol = self.protocol.value
|
||||
self.prev_kwargs = self.storage_options
|
||||
|
||||
self.filter_sel = pn.widgets.CheckBoxGroup(
|
||||
value=[], options=[], inline=False, align="end", width_policy="min"
|
||||
)
|
||||
self._register(self.filter_sel, "filters_changed", auto=True)
|
||||
|
||||
self.panel = pn.Column(
|
||||
pn.Row(self.protocol, self.kwargs),
|
||||
pn.Row(self.home, self.up, self.url, self.go, self.filter_sel),
|
||||
self.main.panel,
|
||||
)
|
||||
self.set_filters(self.filters)
|
||||
self.go_clicked()
|
||||
|
||||
def set_filters(self, filters=None):
|
||||
self.filters = filters
|
||||
if filters:
|
||||
self.filter_sel.options = filters
|
||||
self.filter_sel.value = filters
|
||||
else:
|
||||
self.filter_sel.options = []
|
||||
self.filter_sel.value = []
|
||||
|
||||
@property
|
||||
def storage_options(self):
|
||||
"""Value of the kwargs box as a dictionary"""
|
||||
return ast.literal_eval(self.kwargs.value) or {}
|
||||
|
||||
@property
|
||||
def fs(self):
|
||||
"""Current filesystem instance"""
|
||||
if self._fs is None:
|
||||
cls = get_filesystem_class(self.protocol.value)
|
||||
self._fs = cls(**self.storage_options)
|
||||
return self._fs
|
||||
|
||||
@property
|
||||
def urlpath(self):
|
||||
"""URL of currently selected item"""
|
||||
return (
|
||||
(f"{self.protocol.value}://{self.main.value[0]}")
|
||||
if self.main.value
|
||||
else None
|
||||
)
|
||||
|
||||
def open_file(self, mode="rb", compression=None, encoding=None):
|
||||
"""Create OpenFile instance for the currently selected item
|
||||
|
||||
For example, in a notebook you might do something like
|
||||
|
||||
.. code-block::
|
||||
|
||||
[ ]: sel = FileSelector(); sel
|
||||
|
||||
# user selects their file
|
||||
|
||||
[ ]: with sel.open_file('rb') as f:
|
||||
... out = f.read()
|
||||
|
||||
Parameters
|
||||
----------
|
||||
mode: str (optional)
|
||||
Open mode for the file.
|
||||
compression: str (optional)
|
||||
If given, interact with the file as compressed. Set to 'infer' to guess
|
||||
compression from the file ending
|
||||
encoding: str (optional)
|
||||
If using text mode, use this encoding; defaults to UTF8.
|
||||
"""
|
||||
if self.urlpath is None:
|
||||
raise ValueError("No file selected")
|
||||
return OpenFile(self.fs, self.urlpath, mode, compression, encoding)
|
||||
|
||||
def filters_changed(self, values):
|
||||
self.filters = values
|
||||
self.go_clicked()
|
||||
|
||||
def selection_changed(self, *_):
|
||||
if self.urlpath is None:
|
||||
return
|
||||
if self.fs.isdir(self.urlpath):
|
||||
self.url.value = self.fs._strip_protocol(self.urlpath)
|
||||
self.go_clicked()
|
||||
|
||||
def go_clicked(self, *_):
|
||||
if (
|
||||
self.prev_protocol != self.protocol.value
|
||||
or self.prev_kwargs != self.storage_options
|
||||
):
|
||||
self._fs = None # causes fs to be recreated
|
||||
self.prev_protocol = self.protocol.value
|
||||
self.prev_kwargs = self.storage_options
|
||||
listing = sorted(
|
||||
self.fs.ls(self.url.value, detail=True), key=lambda x: x["name"]
|
||||
)
|
||||
listing = [
|
||||
l
|
||||
for l in listing
|
||||
if not any(i.match(l["name"].rsplit("/", 1)[-1]) for i in self.ignore)
|
||||
]
|
||||
folders = {
|
||||
"📁 " + o["name"].rsplit("/", 1)[-1]: o["name"]
|
||||
for o in listing
|
||||
if o["type"] == "directory"
|
||||
}
|
||||
files = {
|
||||
"📄 " + o["name"].rsplit("/", 1)[-1]: o["name"]
|
||||
for o in listing
|
||||
if o["type"] == "file"
|
||||
}
|
||||
if self.filters:
|
||||
files = {
|
||||
k: v
|
||||
for k, v in files.items()
|
||||
if any(v.endswith(ext) for ext in self.filters)
|
||||
}
|
||||
self.main.set_options(dict(**folders, **files))
|
||||
|
||||
def protocol_changed(self, *_):
|
||||
self._fs = None
|
||||
self.main.options = []
|
||||
self.url.value = ""
|
||||
|
||||
def home_clicked(self, *_):
|
||||
self.protocol.value = self.init_protocol
|
||||
self.kwargs.value = self.init_kwargs
|
||||
self.url.value = self.init_url
|
||||
self.go_clicked()
|
||||
|
||||
def up_clicked(self, *_):
|
||||
self.url.value = self.fs._parent(self.url.value)
|
||||
self.go_clicked()
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,304 @@
|
||||
import errno
|
||||
import io
|
||||
import os
|
||||
import secrets
|
||||
import shutil
|
||||
from contextlib import suppress
|
||||
from functools import cached_property, wraps
|
||||
from urllib.parse import parse_qs
|
||||
|
||||
from fsspec.spec import AbstractFileSystem
|
||||
from fsspec.utils import (
|
||||
get_package_version_without_import,
|
||||
infer_storage_options,
|
||||
mirror_from,
|
||||
tokenize,
|
||||
)
|
||||
|
||||
|
||||
def wrap_exceptions(func):
|
||||
@wraps(func)
|
||||
def wrapper(*args, **kwargs):
|
||||
try:
|
||||
return func(*args, **kwargs)
|
||||
except OSError as exception:
|
||||
if not exception.args:
|
||||
raise
|
||||
|
||||
message, *args = exception.args
|
||||
if isinstance(message, str) and "does not exist" in message:
|
||||
raise FileNotFoundError(errno.ENOENT, message) from exception
|
||||
else:
|
||||
raise
|
||||
|
||||
return wrapper
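# Illustrative sketch of the decorator above (the decorated function is
# hypothetical): pyarrow surfaces missing paths as a plain OSError with a
# "does not exist" message, which wrap_exceptions turns into FileNotFoundError.
@wrap_exceptions
def _lookup(path):
    raise OSError(f"Path {path} does not exist")

try:
    _lookup("/missing")
except FileNotFoundError as exc:
    print("translated:", exc)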
|
||||
|
||||
|
||||
PYARROW_VERSION = None
|
||||
|
||||
|
||||
class ArrowFSWrapper(AbstractFileSystem):
|
||||
"""FSSpec-compatible wrapper of pyarrow.fs.FileSystem.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
fs : pyarrow.fs.FileSystem
|
||||
|
||||
"""
|
||||
|
||||
root_marker = "/"
|
||||
|
||||
def __init__(self, fs, **kwargs):
|
||||
global PYARROW_VERSION
|
||||
PYARROW_VERSION = get_package_version_without_import("pyarrow")
|
||||
self.fs = fs
|
||||
super().__init__(**kwargs)
|
||||
|
||||
@property
|
||||
def protocol(self):
|
||||
return self.fs.type_name
|
||||
|
||||
@cached_property
|
||||
def fsid(self):
|
||||
return "hdfs_" + tokenize(self.fs.host, self.fs.port)
|
||||
|
||||
@classmethod
|
||||
def _strip_protocol(cls, path):
|
||||
ops = infer_storage_options(path)
|
||||
path = ops["path"]
|
||||
if path.startswith("//"):
|
||||
# special case for "hdfs://path" (without the triple slash)
|
||||
path = path[1:]
|
||||
return path
|
||||
|
||||
def ls(self, path, detail=False, **kwargs):
|
||||
path = self._strip_protocol(path)
|
||||
from pyarrow.fs import FileSelector
|
||||
|
||||
entries = [
|
||||
self._make_entry(entry)
|
||||
for entry in self.fs.get_file_info(FileSelector(path))
|
||||
]
|
||||
if detail:
|
||||
return entries
|
||||
else:
|
||||
return [entry["name"] for entry in entries]
|
||||
|
||||
def info(self, path, **kwargs):
|
||||
path = self._strip_protocol(path)
|
||||
[info] = self.fs.get_file_info([path])
|
||||
return self._make_entry(info)
|
||||
|
||||
def exists(self, path):
|
||||
path = self._strip_protocol(path)
|
||||
try:
|
||||
self.info(path)
|
||||
except FileNotFoundError:
|
||||
return False
|
||||
else:
|
||||
return True
|
||||
|
||||
def _make_entry(self, info):
|
||||
from pyarrow.fs import FileType
|
||||
|
||||
if info.type is FileType.Directory:
|
||||
kind = "directory"
|
||||
elif info.type is FileType.File:
|
||||
kind = "file"
|
||||
elif info.type is FileType.NotFound:
|
||||
raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), info.path)
|
||||
else:
|
||||
kind = "other"
|
||||
|
||||
return {
|
||||
"name": info.path,
|
||||
"size": info.size,
|
||||
"type": kind,
|
||||
"mtime": info.mtime,
|
||||
}
|
||||
|
||||
@wrap_exceptions
|
||||
def cp_file(self, path1, path2, **kwargs):
|
||||
path1 = self._strip_protocol(path1).rstrip("/")
|
||||
path2 = self._strip_protocol(path2).rstrip("/")
|
||||
|
||||
with self._open(path1, "rb") as lstream:
|
||||
tmp_fname = f"{path2}.tmp.{secrets.token_hex(6)}"
|
||||
try:
|
||||
with self.open(tmp_fname, "wb") as rstream:
|
||||
shutil.copyfileobj(lstream, rstream)
|
||||
self.fs.move(tmp_fname, path2)
|
||||
except BaseException:
|
||||
with suppress(FileNotFoundError):
|
||||
self.fs.delete_file(tmp_fname)
|
||||
raise
|
||||
|
||||
@wrap_exceptions
|
||||
def mv(self, path1, path2, **kwargs):
|
||||
path1 = self._strip_protocol(path1).rstrip("/")
|
||||
path2 = self._strip_protocol(path2).rstrip("/")
|
||||
self.fs.move(path1, path2)
|
||||
|
||||
@wrap_exceptions
|
||||
def rm_file(self, path):
|
||||
path = self._strip_protocol(path)
|
||||
self.fs.delete_file(path)
|
||||
|
||||
@wrap_exceptions
|
||||
def rm(self, path, recursive=False, maxdepth=None):
|
||||
path = self._strip_protocol(path).rstrip("/")
|
||||
if self.isdir(path):
|
||||
if recursive:
|
||||
self.fs.delete_dir(path)
|
||||
else:
|
||||
raise ValueError("Can't delete directories without recursive=False")
|
||||
else:
|
||||
self.fs.delete_file(path)
|
||||
|
||||
@wrap_exceptions
|
||||
def _open(self, path, mode="rb", block_size=None, seekable=True, **kwargs):
|
||||
if mode == "rb":
|
||||
if seekable:
|
||||
method = self.fs.open_input_file
|
||||
else:
|
||||
method = self.fs.open_input_stream
|
||||
elif mode == "wb":
|
||||
method = self.fs.open_output_stream
|
||||
elif mode == "ab":
|
||||
method = self.fs.open_append_stream
|
||||
else:
|
||||
raise ValueError(f"unsupported mode for Arrow filesystem: {mode!r}")
|
||||
|
||||
_kwargs = {}
|
||||
if mode != "rb" or not seekable:
|
||||
if int(PYARROW_VERSION.split(".")[0]) >= 4:
|
||||
# disable compression auto-detection
|
||||
_kwargs["compression"] = None
|
||||
stream = method(path, **_kwargs)
|
||||
|
||||
return ArrowFile(self, stream, path, mode, block_size, **kwargs)
|
||||
|
||||
@wrap_exceptions
|
||||
def mkdir(self, path, create_parents=True, **kwargs):
|
||||
path = self._strip_protocol(path)
|
||||
if create_parents:
|
||||
self.makedirs(path, exist_ok=True)
|
||||
else:
|
||||
self.fs.create_dir(path, recursive=False)
|
||||
|
||||
@wrap_exceptions
|
||||
def makedirs(self, path, exist_ok=False):
|
||||
path = self._strip_protocol(path)
|
||||
self.fs.create_dir(path, recursive=True)
|
||||
|
||||
@wrap_exceptions
|
||||
def rmdir(self, path):
|
||||
path = self._strip_protocol(path)
|
||||
self.fs.delete_dir(path)
|
||||
|
||||
@wrap_exceptions
|
||||
def modified(self, path):
|
||||
path = self._strip_protocol(path)
|
||||
return self.fs.get_file_info(path).mtime
|
||||
|
||||
def cat_file(self, path, start=None, end=None, **kwargs):
|
||||
kwargs["seekable"] = start not in [None, 0]
|
||||
return super().cat_file(path, start=None, end=None, **kwargs)
|
||||
|
||||
def get_file(self, rpath, lpath, **kwargs):
|
||||
kwargs["seekable"] = False
|
||||
super().get_file(rpath, lpath, **kwargs)
|
||||
|
||||
|
||||
@mirror_from(
|
||||
"stream",
|
||||
[
|
||||
"read",
|
||||
"seek",
|
||||
"tell",
|
||||
"write",
|
||||
"readable",
|
||||
"writable",
|
||||
"close",
|
||||
"size",
|
||||
"seekable",
|
||||
],
|
||||
)
|
||||
class ArrowFile(io.IOBase):
|
||||
def __init__(self, fs, stream, path, mode, block_size=None, **kwargs):
|
||||
self.path = path
|
||||
self.mode = mode
|
||||
|
||||
self.fs = fs
|
||||
self.stream = stream
|
||||
|
||||
self.blocksize = self.block_size = block_size
|
||||
self.kwargs = kwargs
|
||||
|
||||
def __enter__(self):
|
||||
return self
|
||||
|
||||
def __exit__(self, *args):
|
||||
return self.close()
|
||||
|
||||
|
||||
class HadoopFileSystem(ArrowFSWrapper):
|
||||
"""A wrapper on top of the pyarrow.fs.HadoopFileSystem
|
||||
to connect its interface with fsspec"""
|
||||
|
||||
protocol = "hdfs"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
host="default",
|
||||
port=0,
|
||||
user=None,
|
||||
kerb_ticket=None,
|
||||
replication=3,
|
||||
extra_conf=None,
|
||||
**kwargs,
|
||||
):
|
||||
"""
|
||||
|
||||
Parameters
|
||||
----------
|
||||
host: str
|
||||
Hostname, IP or "default" to try to read from Hadoop config
|
||||
port: int
|
||||
Port to connect on, or default from Hadoop config if 0
|
||||
user: str or None
|
||||
If given, connect as this username
|
||||
kerb_ticket: str or None
|
||||
If given, use this ticket for authentication
|
||||
replication: int
|
||||
Set the replication factor for files on write operations; the default is 3.
|
||||
extra_conf: None or dict
|
||||
Passed on to HadoopFileSystem
|
||||
"""
|
||||
from pyarrow.fs import HadoopFileSystem
|
||||
|
||||
fs = HadoopFileSystem(
|
||||
host=host,
|
||||
port=port,
|
||||
user=user,
|
||||
kerb_ticket=kerb_ticket,
|
||||
replication=replication,
|
||||
extra_conf=extra_conf,
|
||||
)
|
||||
super().__init__(fs=fs, **kwargs)
|
||||
|
||||
@staticmethod
|
||||
def _get_kwargs_from_urls(path):
|
||||
ops = infer_storage_options(path)
|
||||
out = {}
|
||||
if ops.get("host", None):
|
||||
out["host"] = ops["host"]
|
||||
if ops.get("username", None):
|
||||
out["user"] = ops["username"]
|
||||
if ops.get("port", None):
|
||||
out["port"] = ops["port"]
|
||||
if ops.get("url_query", None):
|
||||
queries = parse_qs(ops["url_query"])
|
||||
if queries.get("replication", None):
|
||||
out["replication"] = int(queries["replication"][0])
|
||||
return out
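# Illustrative usage sketch (requires pyarrow; the file path is hypothetical):
# wrapping a pyarrow filesystem so it can be driven through the fsspec API
# defined by ArrowFSWrapper above.
from pyarrow.fs import LocalFileSystem

fs = ArrowFSWrapper(LocalFileSystem())
with fs.open("/tmp/arrow_demo.txt", "wb") as f:
    f.write(b"hello")
print(fs.info("/tmp/arrow_demo.txt")["size"])  # 5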
|
||||
@@ -0,0 +1,103 @@
|
||||
import asyncio
|
||||
import functools
|
||||
import inspect
|
||||
|
||||
from fsspec.asyn import AsyncFileSystem, running_async
|
||||
|
||||
|
||||
def async_wrapper(func, obj=None):
|
||||
"""
|
||||
Wraps a synchronous function to make it awaitable.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
func : callable
|
||||
The synchronous function to wrap.
|
||||
obj : object, optional
|
||||
The instance to bind the function to, if applicable.
|
||||
|
||||
Returns
|
||||
-------
|
||||
coroutine
|
||||
An awaitable version of the function.
|
||||
"""
|
||||
|
||||
@functools.wraps(func)
|
||||
async def wrapper(*args, **kwargs):
|
||||
return await asyncio.to_thread(func, *args, **kwargs)
|
||||
|
||||
return wrapper
|
||||
|
||||
|
||||
class AsyncFileSystemWrapper(AsyncFileSystem):
|
||||
"""
|
||||
A wrapper class to convert a synchronous filesystem into an asynchronous one.
|
||||
|
||||
This class takes an existing synchronous filesystem implementation and wraps all
|
||||
its methods to provide an asynchronous interface.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
sync_fs : AbstractFileSystem
|
||||
The synchronous filesystem instance to wrap.
|
||||
"""
|
||||
|
||||
protocol = "async_wrapper"
|
||||
cachable = False
|
||||
|
||||
def __init__(self, fs, *args, asynchronous=None, **kwargs):
|
||||
if asynchronous is None:
|
||||
asynchronous = running_async()
|
||||
super().__init__(*args, asynchronous=asynchronous, **kwargs)
|
||||
self.sync_fs = fs
|
||||
self.protocol = self.sync_fs.protocol
|
||||
self._wrap_all_sync_methods()
|
||||
|
||||
@property
|
||||
def fsid(self):
|
||||
return f"async_{self.sync_fs.fsid}"
|
||||
|
||||
def _wrap_all_sync_methods(self):
|
||||
"""
|
||||
Wrap all synchronous methods of the underlying filesystem with asynchronous versions.
|
||||
"""
|
||||
excluded_methods = {"open"}
|
||||
for method_name in dir(self.sync_fs):
|
||||
if method_name.startswith("_") or method_name in excluded_methods:
|
||||
continue
|
||||
|
||||
attr = inspect.getattr_static(self.sync_fs, method_name)
|
||||
if isinstance(attr, property):
|
||||
continue
|
||||
|
||||
method = getattr(self.sync_fs, method_name)
|
||||
if callable(method) and not asyncio.iscoroutinefunction(method):
|
||||
async_method = async_wrapper(method, obj=self)
|
||||
setattr(self, f"_{method_name}", async_method)
|
||||
|
||||
@classmethod
|
||||
def wrap_class(cls, sync_fs_class):
|
||||
"""
|
||||
Create a new class that can be used to instantiate an AsyncFileSystemWrapper
|
||||
with lazy instantiation of the underlying synchronous filesystem.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
sync_fs_class : type
|
||||
The class of the synchronous filesystem to wrap.
|
||||
|
||||
Returns
|
||||
-------
|
||||
type
|
||||
A new class that wraps the provided synchronous filesystem class.
|
||||
"""
|
||||
|
||||
class GeneratedAsyncFileSystemWrapper(cls):
|
||||
def __init__(self, *args, **kwargs):
|
||||
sync_fs = sync_fs_class(*args, **kwargs)
|
||||
super().__init__(sync_fs)
|
||||
|
||||
GeneratedAsyncFileSystemWrapper.__name__ = (
|
||||
f"Async{sync_fs_class.__name__}Wrapper"
|
||||
)
|
||||
return GeneratedAsyncFileSystemWrapper
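# Illustrative usage sketch (the file path is hypothetical and
# platform-dependent): wrap_class builds an async flavour of the synchronous
# local filesystem, whose wrapped methods are awaited via asyncio.to_thread.
import asyncio

from fsspec.implementations.local import LocalFileSystem

async def demo():
    AsyncLocal = AsyncFileSystemWrapper.wrap_class(LocalFileSystem)
    afs = AsyncLocal()
    return await afs._cat_file("/etc/hostname")

print(asyncio.run(demo()))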
|
||||
@@ -0,0 +1,75 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import abc
|
||||
import hashlib
|
||||
|
||||
from fsspec.implementations.local import make_path_posix
|
||||
|
||||
|
||||
class AbstractCacheMapper(abc.ABC):
|
||||
"""Abstract super-class for mappers from remote URLs to local cached
|
||||
basenames.
|
||||
"""
|
||||
|
||||
@abc.abstractmethod
|
||||
def __call__(self, path: str) -> str: ...
|
||||
|
||||
def __eq__(self, other: object) -> bool:
|
||||
# Identity only depends on class. When derived classes have attributes
|
||||
# they will need to be included.
|
||||
return isinstance(other, type(self))
|
||||
|
||||
def __hash__(self) -> int:
|
||||
# Identity only depends on class. When derived classes have attributes
|
||||
# they will need to be included.
|
||||
return hash(type(self))
|
||||
|
||||
|
||||
class BasenameCacheMapper(AbstractCacheMapper):
|
||||
"""Cache mapper that uses the basename of the remote URL and a fixed number
|
||||
of directory levels above this.
|
||||
|
||||
The default is zero directory levels, meaning different paths with the same
|
||||
basename will have the same cached basename.
|
||||
"""
|
||||
|
||||
def __init__(self, directory_levels: int = 0):
|
||||
if directory_levels < 0:
|
||||
raise ValueError(
|
||||
"BasenameCacheMapper requires zero or positive directory_levels"
|
||||
)
|
||||
self.directory_levels = directory_levels
|
||||
|
||||
# Separator for directories when encoded as strings.
|
||||
self._separator = "_@_"
|
||||
|
||||
def __call__(self, path: str) -> str:
|
||||
path = make_path_posix(path)
|
||||
prefix, *bits = path.rsplit("/", self.directory_levels + 1)
|
||||
if bits:
|
||||
return self._separator.join(bits)
|
||||
else:
|
||||
return prefix # No separator found, simple filename
|
||||
|
||||
def __eq__(self, other: object) -> bool:
|
||||
return super().__eq__(other) and self.directory_levels == other.directory_levels
|
||||
|
||||
def __hash__(self) -> int:
|
||||
return super().__hash__() ^ hash(self.directory_levels)
|
||||
|
||||
|
||||
class HashCacheMapper(AbstractCacheMapper):
|
||||
"""Cache mapper that uses a hash of the remote URL."""
|
||||
|
||||
def __call__(self, path: str) -> str:
|
||||
return hashlib.sha256(path.encode()).hexdigest()
|
||||
|
||||
|
||||
def create_cache_mapper(same_names: bool) -> AbstractCacheMapper:
|
||||
"""Factory method to create cache mapper for backward compatibility with
|
||||
``CachingFileSystem`` constructor using ``same_names`` kwarg.
|
||||
"""
|
||||
if same_names:
|
||||
return BasenameCacheMapper()
|
||||
else:
|
||||
return HashCacheMapper()
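# Illustrative sketch of the mappers above (the URL is hypothetical): the
# basename mapper keeps a configurable number of directory levels, while the
# hash mapper always yields a 64-character sha256 hex digest.
url = "s3://bucket/dir/file.csv"
print(BasenameCacheMapper()(url))                    # "file.csv"
print(BasenameCacheMapper(directory_levels=1)(url))  # "dir_@_file.csv"
print(len(HashCacheMapper()(url)))                   # 64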
|
||||
@@ -0,0 +1,232 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import pickle
|
||||
import time
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from fsspec.utils import atomic_write
|
||||
|
||||
try:
|
||||
import ujson as json
|
||||
except ImportError:
|
||||
if not TYPE_CHECKING:
|
||||
import json
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from typing import Any, Dict, Iterator, Literal
|
||||
|
||||
from typing_extensions import TypeAlias
|
||||
|
||||
from .cached import CachingFileSystem
|
||||
|
||||
Detail: TypeAlias = Dict[str, Any]
|
||||
|
||||
|
||||
class CacheMetadata:
|
||||
"""Cache metadata.
|
||||
|
||||
All reading and writing of cache metadata is performed by this class,
|
||||
accessing the cached files and blocks is not.
|
||||
|
||||
Metadata is stored in a single file per storage directory in JSON format.
|
||||
For backward compatibility, also reads metadata stored in pickle format
|
||||
which is converted to JSON when next saved.
|
||||
"""
|
||||
|
||||
def __init__(self, storage: list[str]):
|
||||
"""
|
||||
|
||||
Parameters
|
||||
----------
|
||||
storage: list[str]
|
||||
Directories containing cached files; there must be at least one. Metadata
|
||||
is stored in the last of these directories by convention.
|
||||
"""
|
||||
if not storage:
|
||||
raise ValueError("CacheMetadata expects at least one storage location")
|
||||
|
||||
self._storage = storage
|
||||
self.cached_files: list[Detail] = [{}]
|
||||
|
||||
# Private attribute to force saving of metadata in pickle format rather than
|
||||
# JSON, for use in tests confirming that both pickle and JSON formats can be read.
|
||||
self._force_save_pickle = False
|
||||
|
||||
def _load(self, fn: str) -> Detail:
|
||||
"""Low-level function to load metadata from specific file"""
|
||||
try:
|
||||
with open(fn, "r") as f:
|
||||
loaded = json.load(f)
|
||||
except ValueError:
|
||||
with open(fn, "rb") as f:
|
||||
loaded = pickle.load(f)
|
||||
for c in loaded.values():
|
||||
if isinstance(c.get("blocks"), list):
|
||||
c["blocks"] = set(c["blocks"])
|
||||
return loaded
|
||||
|
||||
def _save(self, metadata_to_save: Detail, fn: str) -> None:
|
||||
"""Low-level function to save metadata to specific file"""
|
||||
if self._force_save_pickle:
|
||||
with atomic_write(fn) as f:
|
||||
pickle.dump(metadata_to_save, f)
|
||||
else:
|
||||
with atomic_write(fn, mode="w") as f:
|
||||
json.dump(metadata_to_save, f)
|
||||
|
||||
def _scan_locations(
|
||||
self, writable_only: bool = False
|
||||
) -> Iterator[tuple[str, str, bool]]:
|
||||
"""Yield locations (filenames) where metadata is stored, and whether
|
||||
writable or not.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
writable_only: bool
|
||||
Set to True to only yield writable locations.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Yields (str, str, bool)
|
||||
"""
|
||||
n = len(self._storage)
|
||||
for i, storage in enumerate(self._storage):
|
||||
writable = i == n - 1
|
||||
if writable_only and not writable:
|
||||
continue
|
||||
yield os.path.join(storage, "cache"), storage, writable
|
||||
|
||||
def check_file(
|
||||
self, path: str, cfs: CachingFileSystem | None
|
||||
) -> Literal[False] | tuple[Detail, str]:
|
||||
"""If path is in cache return its details, otherwise return ``False``.
|
||||
|
||||
If the optional CachingFileSystem is specified then it is used to
|
||||
perform extra checks to reject possible matches, such as if they are
|
||||
too old.
|
||||
"""
|
||||
for (fn, base, _), cache in zip(self._scan_locations(), self.cached_files):
|
||||
if path not in cache:
|
||||
continue
|
||||
detail = cache[path].copy()
|
||||
|
||||
if cfs is not None:
|
||||
if cfs.check_files and detail["uid"] != cfs.fs.ukey(path):
|
||||
# Wrong file as determined by hash of file properties
|
||||
continue
|
||||
if cfs.expiry and time.time() - detail["time"] > cfs.expiry:
|
||||
# Cached file has expired
|
||||
continue
|
||||
|
||||
fn = os.path.join(base, detail["fn"])
|
||||
if os.path.exists(fn):
|
||||
return detail, fn
|
||||
return False
|
||||
|
||||
def clear_expired(self, expiry_time: int) -> tuple[list[str], bool]:
|
||||
"""Remove expired metadata from the cache.
|
||||
|
||||
Returns names of files corresponding to expired metadata and a boolean
|
||||
flag indicating whether the writable cache is empty. Caller is
|
||||
responsible for deleting the expired files.
|
||||
"""
|
||||
expired_files = []
|
||||
for path, detail in self.cached_files[-1].copy().items():
|
||||
if time.time() - detail["time"] > expiry_time:
|
||||
fn = detail.get("fn", "")
|
||||
if not fn:
|
||||
raise RuntimeError(
|
||||
f"Cache metadata does not contain 'fn' for {path}"
|
||||
)
|
||||
fn = os.path.join(self._storage[-1], fn)
|
||||
expired_files.append(fn)
|
||||
self.cached_files[-1].pop(path)
|
||||
|
||||
if self.cached_files[-1]:
|
||||
cache_path = os.path.join(self._storage[-1], "cache")
|
||||
self._save(self.cached_files[-1], cache_path)
|
||||
|
||||
writable_cache_empty = not self.cached_files[-1]
|
||||
return expired_files, writable_cache_empty
|
||||
|
||||
def load(self) -> None:
|
||||
"""Load all metadata from disk and store in ``self.cached_files``"""
|
||||
cached_files = []
|
||||
for fn, _, _ in self._scan_locations():
|
||||
if os.path.exists(fn):
|
||||
# TODO: consolidate blocks here
|
||||
cached_files.append(self._load(fn))
|
||||
else:
|
||||
cached_files.append({})
|
||||
self.cached_files = cached_files or [{}]
|
||||
|
||||
def on_close_cached_file(self, f: Any, path: str) -> None:
|
||||
"""Perform side-effect actions on closing a cached file.
|
||||
|
||||
The actual closing of the file is the responsibility of the caller.
|
||||
"""
|
||||
# File must be writable, so it is in self.cached_files[-1]
|
||||
c = self.cached_files[-1][path]
|
||||
if c["blocks"] is not True and len(c["blocks"]) * f.blocksize >= f.size:
|
||||
c["blocks"] = True
|
||||
|
||||
def pop_file(self, path: str) -> str | None:
|
||||
"""Remove metadata of cached file.
|
||||
|
||||
If path is in the cache, return the filename of the cached file,
|
||||
otherwise return ``None``. Caller is responsible for deleting the
|
||||
cached file.
|
||||
"""
|
||||
details = self.check_file(path, None)
|
||||
if not details:
|
||||
return None
|
||||
_, fn = details
|
||||
if fn.startswith(self._storage[-1]):
|
||||
self.cached_files[-1].pop(path)
|
||||
self.save()
|
||||
else:
|
||||
raise PermissionError(
|
||||
"Can only delete cached file in last, writable cache location"
|
||||
)
|
||||
return fn
|
||||
|
||||
def save(self) -> None:
|
||||
"""Save metadata to disk"""
|
||||
for (fn, _, writable), cache in zip(self._scan_locations(), self.cached_files):
|
||||
if not writable:
|
||||
continue
|
||||
|
||||
if os.path.exists(fn):
|
||||
cached_files = self._load(fn)
|
||||
for k, c in cached_files.items():
|
||||
if k in cache:
|
||||
if c["blocks"] is True or cache[k]["blocks"] is True:
|
||||
c["blocks"] = True
|
||||
else:
|
||||
# self.cached_files[*][*]["blocks"] must continue to
|
||||
# point to the same set object so that updates
|
||||
# performed by MMapCache are propagated back to
|
||||
# self.cached_files.
|
||||
blocks = cache[k]["blocks"]
|
||||
blocks.update(c["blocks"])
|
||||
c["blocks"] = blocks
|
||||
c["time"] = max(c["time"], cache[k]["time"])
|
||||
c["uid"] = cache[k]["uid"]
|
||||
|
||||
# Files can be added to cache after it was written once
|
||||
for k, c in cache.items():
|
||||
if k not in cached_files:
|
||||
cached_files[k] = c
|
||||
else:
|
||||
cached_files = cache
|
||||
cache = {k: v.copy() for k, v in cached_files.items()}
|
||||
for c in cache.values():
|
||||
if isinstance(c["blocks"], set):
|
||||
c["blocks"] = list(c["blocks"])
|
||||
self._save(cache, fn)
|
||||
self.cached_files[-1] = cached_files
|
||||
|
||||
def update_file(self, path: str, detail: Detail) -> None:
|
||||
"""Update metadata for specific file in memory, do not save"""
|
||||
self.cached_files[-1][path] = detail
|
||||
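# A minimal usage sketch of CacheMetadata (the storage path and field values are
# illustrative, and the directory is assumed to exist already):
#
#     from fsspec.implementations.cache_metadata import CacheMetadata
#
#     md = CacheMetadata(["/tmp/fsspec-cache"])
#     md.load()                          # read an existing "cache" file, if any
#     md.update_file("s3://bucket/key", {
#         "original": "s3://bucket/key",
#         "fn": "abc123",                # hashed local filename
#         "blocks": True,                # True means the whole file is cached
#         "time": 1700000000.0,
#         "uid": "some-ukey",
#     })
#     md.save()                          # atomically writes the JSON metadata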
@@ -0,0 +1,941 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import inspect
|
||||
import logging
|
||||
import os
|
||||
import tempfile
|
||||
import time
|
||||
import weakref
|
||||
from shutil import rmtree
|
||||
from typing import TYPE_CHECKING, Any, Callable, ClassVar
|
||||
|
||||
from fsspec import AbstractFileSystem, filesystem
|
||||
from fsspec.callbacks import DEFAULT_CALLBACK
|
||||
from fsspec.compression import compr
|
||||
from fsspec.core import BaseCache, MMapCache
|
||||
from fsspec.exceptions import BlocksizeMismatchError
|
||||
from fsspec.implementations.cache_mapper import create_cache_mapper
|
||||
from fsspec.implementations.cache_metadata import CacheMetadata
|
||||
from fsspec.spec import AbstractBufferedFile
|
||||
from fsspec.transaction import Transaction
|
||||
from fsspec.utils import infer_compression
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from fsspec.implementations.cache_mapper import AbstractCacheMapper
|
||||
|
||||
logger = logging.getLogger("fsspec.cached")
|
||||
|
||||
|
||||
class WriteCachedTransaction(Transaction):
|
||||
def complete(self, commit=True):
|
||||
rpaths = [f.path for f in self.files]
|
||||
lpaths = [f.fn for f in self.files]
|
||||
if commit:
|
||||
self.fs.put(lpaths, rpaths)
|
||||
self.files.clear()
|
||||
self.fs._intrans = False
|
||||
self.fs._transaction = None
|
||||
self.fs = None # break cycle
|
||||
|
||||
|
||||
class CachingFileSystem(AbstractFileSystem):
|
||||
"""Locally caching filesystem, layer over any other FS
|
||||
|
||||
This class implements chunk-wise local storage of remote files, for quick
|
||||
access after the initial download. The files are stored in a given
|
||||
directory with hashes of URLs for the filenames. If no directory is given,
|
||||
a temporary one is used, which should be cleaned up by the OS after the
|
||||
process ends. The files themselves are sparse (as implemented in
|
||||
:class:`~fsspec.caching.MMapCache`), so only the data which is accessed
|
||||
takes up space.
|
||||
|
||||
Restrictions:
|
||||
|
||||
- the block-size must be the same for each access of a given file, unless
|
||||
all blocks of the file have already been read
|
||||
- caching can only be applied to file-systems which produce files
|
||||
derived from fsspec.spec.AbstractBufferedFile ; LocalFileSystem is also
|
||||
allowed, for testing
|
||||
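
A minimal usage sketch (the file path and cache directory are illustrative; the
local "file" protocol is used as the target, which is allowed for testing):

>>> import fsspec
>>> fs = fsspec.filesystem(
...     "blockcache", target_protocol="file", cache_storage="/tmp/block-cache"
... )
>>> with fs.open("/path/to/large-file.bin") as f:
...     header = f.read(1024)  # only the blocks actually read are cached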
"""
|
||||
|
||||
protocol: ClassVar[str | tuple[str, ...]] = ("blockcache", "cached")
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
target_protocol=None,
|
||||
cache_storage="TMP",
|
||||
cache_check=10,
|
||||
check_files=False,
|
||||
expiry_time=604800,
|
||||
target_options=None,
|
||||
fs=None,
|
||||
same_names: bool | None = None,
|
||||
compression=None,
|
||||
cache_mapper: AbstractCacheMapper | None = None,
|
||||
**kwargs,
|
||||
):
|
||||
"""
|
||||
|
||||
Parameters
|
||||
----------
|
||||
target_protocol: str (optional)
|
||||
Target filesystem protocol. Provide either this or ``fs``.
|
||||
cache_storage: str or list(str)
|
||||
Location to store files. If "TMP", this is a temporary directory,
|
||||
and will be cleaned up by the OS when this process ends (or later).
|
||||
If a list, each location will be tried in the order given, but
|
||||
only the last will be considered writable.
|
||||
cache_check: int
|
||||
Number of seconds between reload of cache metadata
|
||||
check_files: bool
|
||||
Whether to explicitly see if the UID of the remote file matches
|
||||
the stored one before using. Warning: some file systems such as
|
||||
HTTP cannot reliably give a unique hash of the contents of some
|
||||
path, so be sure to set this option to False.
|
||||
expiry_time: int
|
||||
The time in seconds after which a local copy is considered useless.
|
||||
Set to falsy to prevent expiry. The default is equivalent to one
|
||||
week.
|
||||
target_options: dict or None
|
||||
Passed to the instantiation of the FS, if fs is None.
|
||||
fs: filesystem instance
|
||||
The target filesystem to run against. Provide this or ``protocol``.
|
||||
same_names: bool (optional)
|
||||
By default, target URLs are hashed using a ``HashCacheMapper`` so
|
||||
that files from different backends with the same basename do not
|
||||
conflict. If this argument is ``True``, a ``BasenameCacheMapper``
|
||||
is used instead. Other cache mapper options are available by using
|
||||
the ``cache_mapper`` keyword argument. Only one of this and
|
||||
``cache_mapper`` should be specified.
|
||||
compression: str (optional)
|
||||
To decompress on download. Can be 'infer' (guess from the URL name),
|
||||
one of the entries in ``fsspec.compression.compr``, or None for no
|
||||
decompression.
|
||||
cache_mapper: AbstractCacheMapper (optional)
|
||||
The object used to map from original filenames to cached filenames.
|
||||
Only one of this and ``same_names`` should be specified.
|
||||
"""
|
||||
super().__init__(**kwargs)
|
||||
if fs is None and target_protocol is None:
|
||||
raise ValueError(
|
||||
"Please provide filesystem instance(fs) or target_protocol"
|
||||
)
|
||||
if not (fs is None) ^ (target_protocol is None):
|
||||
raise ValueError(
|
||||
"Both filesystems (fs) and target_protocol may not be both given."
|
||||
)
|
||||
if cache_storage == "TMP":
|
||||
tempdir = tempfile.mkdtemp()
|
||||
storage = [tempdir]
|
||||
weakref.finalize(self, self._remove_tempdir, tempdir)
|
||||
else:
|
||||
if isinstance(cache_storage, str):
|
||||
storage = [cache_storage]
|
||||
else:
|
||||
storage = cache_storage
|
||||
os.makedirs(storage[-1], exist_ok=True)
|
||||
self.storage = storage
|
||||
self.kwargs = target_options or {}
|
||||
self.cache_check = cache_check
|
||||
self.check_files = check_files
|
||||
self.expiry = expiry_time
|
||||
self.compression = compression
|
||||
|
||||
# Size of cache in bytes. If None then the size is unknown and will be
|
||||
# recalculated the next time cache_size() is called. On writes to the
|
||||
# cache this is reset to None.
|
||||
self._cache_size = None
|
||||
|
||||
if same_names is not None and cache_mapper is not None:
|
||||
raise ValueError(
|
||||
"Cannot specify both same_names and cache_mapper in "
|
||||
"CachingFileSystem.__init__"
|
||||
)
|
||||
if cache_mapper is not None:
|
||||
self._mapper = cache_mapper
|
||||
else:
|
||||
self._mapper = create_cache_mapper(
|
||||
same_names if same_names is not None else False
|
||||
)
|
||||
|
||||
self.target_protocol = (
|
||||
target_protocol
|
||||
if isinstance(target_protocol, str)
|
||||
else (fs.protocol if isinstance(fs.protocol, str) else fs.protocol[0])
|
||||
)
|
||||
self._metadata = CacheMetadata(self.storage)
|
||||
self.load_cache()
|
||||
self.fs = fs if fs is not None else filesystem(target_protocol, **self.kwargs)
|
||||
|
||||
def _strip_protocol(path):
|
||||
# acts as a method, since each instance has a different target
|
||||
return self.fs._strip_protocol(type(self)._strip_protocol(path))
|
||||
|
||||
self._strip_protocol: Callable = _strip_protocol
|
||||
|
||||
@staticmethod
|
||||
def _remove_tempdir(tempdir):
|
||||
try:
|
||||
rmtree(tempdir)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
def _mkcache(self):
|
||||
os.makedirs(self.storage[-1], exist_ok=True)
|
||||
|
||||
def cache_size(self):
|
||||
"""Return size of cache in bytes.
|
||||
|
||||
If more than one cache directory is in use, only the size of the last
|
||||
one (the writable cache directory) is returned.
|
||||
"""
|
||||
if self._cache_size is None:
|
||||
cache_dir = self.storage[-1]
|
||||
self._cache_size = filesystem("file").du(cache_dir, withdirs=True)
|
||||
return self._cache_size
|
||||
|
||||
def load_cache(self):
|
||||
"""Read set of stored blocks from file"""
|
||||
self._metadata.load()
|
||||
self._mkcache()
|
||||
self.last_cache = time.time()
|
||||
|
||||
def save_cache(self):
|
||||
"""Save set of stored blocks from file"""
|
||||
self._mkcache()
|
||||
self._metadata.save()
|
||||
self.last_cache = time.time()
|
||||
self._cache_size = None
|
||||
|
||||
def _check_cache(self):
|
||||
"""Reload caches if time elapsed or any disappeared"""
|
||||
self._mkcache()
|
||||
if not self.cache_check:
|
||||
# explicitly told not to bother checking
|
||||
return
|
||||
timecond = time.time() - self.last_cache > self.cache_check
|
||||
existcond = all(os.path.exists(storage) for storage in self.storage)
|
||||
if timecond or not existcond:
|
||||
self.load_cache()
|
||||
|
||||
def _check_file(self, path):
|
||||
"""Is path in cache and still valid"""
|
||||
path = self._strip_protocol(path)
|
||||
self._check_cache()
|
||||
return self._metadata.check_file(path, self)
|
||||
|
||||
def clear_cache(self):
|
||||
"""Remove all files and metadata from the cache
|
||||
|
||||
In the case of multiple cache locations, this clears only the last one,
|
||||
which is assumed to be the read/write one.
|
||||
"""
|
||||
rmtree(self.storage[-1])
|
||||
self.load_cache()
|
||||
self._cache_size = None
|
||||
|
||||
def clear_expired_cache(self, expiry_time=None):
|
||||
"""Remove all expired files and metadata from the cache
|
||||
|
||||
In the case of multiple cache locations, this clears only the last one,
|
||||
which is assumed to be the read/write one.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
expiry_time: int
|
||||
The time in seconds after which a local copy is considered useless.
|
||||
If not defined the default is equivalent to the attribute from the
|
||||
file caching instantiation.
|
||||
"""
|
||||
|
||||
if not expiry_time:
|
||||
expiry_time = self.expiry
|
||||
|
||||
self._check_cache()
|
||||
|
||||
expired_files, writable_cache_empty = self._metadata.clear_expired(expiry_time)
|
||||
for fn in expired_files:
|
||||
if os.path.exists(fn):
|
||||
os.remove(fn)
|
||||
|
||||
if writable_cache_empty:
|
||||
rmtree(self.storage[-1])
|
||||
self.load_cache()
|
||||
|
||||
self._cache_size = None
|
||||
|
||||
def pop_from_cache(self, path):
|
||||
"""Remove cached version of given file
|
||||
|
||||
Deletes local copy of the given (remote) path. If it is found in a cache
|
||||
location which is not the last, it is assumed to be read-only, and
|
||||
raises PermissionError
|
||||
"""
|
||||
path = self._strip_protocol(path)
|
||||
fn = self._metadata.pop_file(path)
|
||||
if fn is not None:
|
||||
os.remove(fn)
|
||||
self._cache_size = None
|
||||
|
||||
def _open(
|
||||
self,
|
||||
path,
|
||||
mode="rb",
|
||||
block_size=None,
|
||||
autocommit=True,
|
||||
cache_options=None,
|
||||
**kwargs,
|
||||
):
|
||||
"""Wrap the target _open
|
||||
|
||||
If the whole file exists in the cache, just open it locally and
|
||||
return that.
|
||||
|
||||
Otherwise, open the file on the target FS, and make it have a mmap
|
||||
cache pointing to the location we determine within our cache.
|
||||
The ``blocks`` instance is shared, so as the mmap cache instance
|
||||
updates, so does the entry in our ``cached_files`` attribute.
|
||||
We monkey-patch this file, so that when it closes, we call
|
||||
``close_and_update`` to save the state of the blocks.
|
||||
"""
|
||||
path = self._strip_protocol(path)
|
||||
|
||||
path = self.fs._strip_protocol(path)
|
||||
if "r" not in mode:
|
||||
return self.fs._open(
|
||||
path,
|
||||
mode=mode,
|
||||
block_size=block_size,
|
||||
autocommit=autocommit,
|
||||
cache_options=cache_options,
|
||||
**kwargs,
|
||||
)
|
||||
detail = self._check_file(path)
|
||||
if detail:
|
||||
# file is in cache
|
||||
detail, fn = detail
|
||||
hash, blocks = detail["fn"], detail["blocks"]
|
||||
if blocks is True:
|
||||
# stored file is complete
|
||||
logger.debug("Opening local copy of %s", path)
|
||||
return open(fn, mode)
|
||||
# TODO: action where partial file exists in read-only cache
|
||||
logger.debug("Opening partially cached copy of %s", path)
|
||||
else:
|
||||
hash = self._mapper(path)
|
||||
fn = os.path.join(self.storage[-1], hash)
|
||||
blocks = set()
|
||||
detail = {
|
||||
"original": path,
|
||||
"fn": hash,
|
||||
"blocks": blocks,
|
||||
"time": time.time(),
|
||||
"uid": self.fs.ukey(path),
|
||||
}
|
||||
self._metadata.update_file(path, detail)
|
||||
logger.debug("Creating local sparse file for %s", path)
|
||||
|
||||
# call target filesystems open
|
||||
self._mkcache()
|
||||
f = self.fs._open(
|
||||
path,
|
||||
mode=mode,
|
||||
block_size=block_size,
|
||||
autocommit=autocommit,
|
||||
cache_options=cache_options,
|
||||
cache_type="none",
|
||||
**kwargs,
|
||||
)
|
||||
if self.compression:
|
||||
comp = (
|
||||
infer_compression(path)
|
||||
if self.compression == "infer"
|
||||
else self.compression
|
||||
)
|
||||
f = compr[comp](f, mode="rb")
|
||||
if "blocksize" in detail:
|
||||
if detail["blocksize"] != f.blocksize:
|
||||
raise BlocksizeMismatchError(
|
||||
f"Cached file must be reopened with same block"
|
||||
f" size as original (old: {detail['blocksize']},"
|
||||
f" new {f.blocksize})"
|
||||
)
|
||||
else:
|
||||
detail["blocksize"] = f.blocksize
|
||||
|
||||
def _fetch_ranges(ranges):
|
||||
return self.fs.cat_ranges(
|
||||
[path] * len(ranges),
|
||||
[r[0] for r in ranges],
|
||||
[r[1] for r in ranges],
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
multi_fetcher = None if self.compression else _fetch_ranges
|
||||
f.cache = MMapCache(
|
||||
f.blocksize, f._fetch_range, f.size, fn, blocks, multi_fetcher=multi_fetcher
|
||||
)
|
||||
close = f.close
|
||||
f.close = lambda: self.close_and_update(f, close)
|
||||
self.save_cache()
|
||||
return f
|
||||
|
||||
def _parent(self, path):
|
||||
return self.fs._parent(path)
|
||||
|
||||
def hash_name(self, path: str, *args: Any) -> str:
|
||||
# Kept for backward compatibility with downstream libraries.
|
||||
# Ignores extra arguments, previously same_name boolean.
|
||||
return self._mapper(path)
|
||||
|
||||
def close_and_update(self, f, close):
|
||||
"""Called when a file is closing, so store the set of blocks"""
|
||||
if f.closed:
|
||||
return
|
||||
path = self._strip_protocol(f.path)
|
||||
self._metadata.on_close_cached_file(f, path)
|
||||
try:
|
||||
logger.debug("going to save")
|
||||
self.save_cache()
|
||||
logger.debug("saved")
|
||||
except OSError:
|
||||
logger.debug("Cache saving failed while closing file")
|
||||
except NameError:
|
||||
logger.debug("Cache save failed due to interpreter shutdown")
|
||||
close()
|
||||
f.closed = True
|
||||
|
||||
def ls(self, path, detail=True):
|
||||
return self.fs.ls(path, detail)
|
||||
|
||||
def __getattribute__(self, item):
|
||||
if item in {
|
||||
"load_cache",
|
||||
"_open",
|
||||
"save_cache",
|
||||
"close_and_update",
|
||||
"__init__",
|
||||
"__getattribute__",
|
||||
"__reduce__",
|
||||
"_make_local_details",
|
||||
"open",
|
||||
"cat",
|
||||
"cat_file",
|
||||
"cat_ranges",
|
||||
"get",
|
||||
"read_block",
|
||||
"tail",
|
||||
"head",
|
||||
"info",
|
||||
"ls",
|
||||
"exists",
|
||||
"isfile",
|
||||
"isdir",
|
||||
"_check_file",
|
||||
"_check_cache",
|
||||
"_mkcache",
|
||||
"clear_cache",
|
||||
"clear_expired_cache",
|
||||
"pop_from_cache",
|
||||
"local_file",
|
||||
"_paths_from_path",
|
||||
"get_mapper",
|
||||
"open_many",
|
||||
"commit_many",
|
||||
"hash_name",
|
||||
"__hash__",
|
||||
"__eq__",
|
||||
"to_json",
|
||||
"to_dict",
|
||||
"cache_size",
|
||||
"pipe_file",
|
||||
"pipe",
|
||||
"start_transaction",
|
||||
"end_transaction",
|
||||
}:
|
||||
# all the methods defined in this class. Note `open` here, since
|
||||
# it calls `_open`, but is actually in superclass
|
||||
return lambda *args, **kw: getattr(type(self), item).__get__(self)(
|
||||
*args, **kw
|
||||
)
|
||||
if item in ["__reduce_ex__"]:
|
||||
raise AttributeError
|
||||
if item in ["transaction"]:
|
||||
# property
|
||||
return type(self).transaction.__get__(self)
|
||||
if item in ["_cache", "transaction_type"]:
|
||||
# class attributes
|
||||
return getattr(type(self), item)
|
||||
if item == "__class__":
|
||||
return type(self)
|
||||
d = object.__getattribute__(self, "__dict__")
|
||||
fs = d.get("fs", None) # fs is not immediately defined
|
||||
if item in d:
|
||||
return d[item]
|
||||
elif fs is not None:
|
||||
if item in fs.__dict__:
|
||||
# attribute of instance
|
||||
return fs.__dict__[item]
|
||||
# attributes belonging to the target filesystem
|
||||
cls = type(fs)
|
||||
m = getattr(cls, item)
|
||||
if (inspect.isfunction(m) or inspect.isdatadescriptor(m)) and (
|
||||
not hasattr(m, "__self__") or m.__self__ is None
|
||||
):
|
||||
# instance method
|
||||
return m.__get__(fs, cls)
|
||||
return m # class method or attribute
|
||||
else:
|
||||
# attributes of the superclass, while target is being set up
|
||||
return super().__getattribute__(item)
|
||||
|
||||
def __eq__(self, other):
|
||||
"""Test for equality."""
|
||||
if self is other:
|
||||
return True
|
||||
if not isinstance(other, type(self)):
|
||||
return False
|
||||
return (
|
||||
self.storage == other.storage
|
||||
and self.kwargs == other.kwargs
|
||||
and self.cache_check == other.cache_check
|
||||
and self.check_files == other.check_files
|
||||
and self.expiry == other.expiry
|
||||
and self.compression == other.compression
|
||||
and self._mapper == other._mapper
|
||||
and self.target_protocol == other.target_protocol
|
||||
)
|
||||
|
||||
def __hash__(self):
|
||||
"""Calculate hash."""
|
||||
return (
|
||||
hash(tuple(self.storage))
|
||||
^ hash(str(self.kwargs))
|
||||
^ hash(self.cache_check)
|
||||
^ hash(self.check_files)
|
||||
^ hash(self.expiry)
|
||||
^ hash(self.compression)
|
||||
^ hash(self._mapper)
|
||||
^ hash(self.target_protocol)
|
||||
)
|
||||
|
||||
|
||||
class WholeFileCacheFileSystem(CachingFileSystem):
|
||||
"""Caches whole remote files on first access
|
||||
|
||||
This class is intended as a layer over any other file system, and
|
||||
will make a local copy of each file accessed, so that all subsequent
|
||||
reads are local. This is similar to ``CachingFileSystem``, but without
|
||||
the block-wise functionality and so can work even when sparse files
|
||||
are not allowed. See its docstring for definition of the init
|
||||
arguments.
|
||||
|
||||
The class still needs access to the remote store for listing files,
|
||||
and may refresh cached files.
|
||||
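
A minimal usage sketch (the remote URL and cache directory are illustrative):

>>> import fsspec
>>> fs = fsspec.filesystem(
...     "filecache", target_protocol="https", cache_storage="/tmp/file-cache"
... )
>>> with fs.open("https://example.com/data.csv") as f:
...     data = f.read()  # downloaded whole on first access, read locally after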
"""
|
||||
|
||||
protocol = "filecache"
|
||||
local_file = True
|
||||
|
||||
def open_many(self, open_files, **kwargs):
|
||||
paths = [of.path for of in open_files]
|
||||
if "r" in open_files.mode:
|
||||
self._mkcache()
|
||||
else:
|
||||
return [
|
||||
LocalTempFile(
|
||||
self.fs,
|
||||
path,
|
||||
mode=open_files.mode,
|
||||
fn=os.path.join(self.storage[-1], self._mapper(path)),
|
||||
**kwargs,
|
||||
)
|
||||
for path in paths
|
||||
]
|
||||
|
||||
if self.compression:
|
||||
raise NotImplementedError
|
||||
details = [self._check_file(sp) for sp in paths]
|
||||
downpath = [p for p, d in zip(paths, details) if not d]
|
||||
downfn0 = [
|
||||
os.path.join(self.storage[-1], self._mapper(p))
|
||||
for p, d in zip(paths, details)
|
||||
] # keep these path names for opening later
|
||||
downfn = [fn for fn, d in zip(downfn0, details) if not d]
|
||||
if downpath:
|
||||
# skip if all files are already cached and up to date
|
||||
self.fs.get(downpath, downfn)
|
||||
|
||||
# update metadata - only happens when downloads are successful
|
||||
newdetail = [
|
||||
{
|
||||
"original": path,
|
||||
"fn": self._mapper(path),
|
||||
"blocks": True,
|
||||
"time": time.time(),
|
||||
"uid": self.fs.ukey(path),
|
||||
}
|
||||
for path in downpath
|
||||
]
|
||||
for path, detail in zip(downpath, newdetail):
|
||||
self._metadata.update_file(path, detail)
|
||||
self.save_cache()
|
||||
|
||||
def firstpart(fn):
|
||||
# helper to adapt both whole-file and simple-cache
|
||||
return fn[1] if isinstance(fn, tuple) else fn
|
||||
|
||||
return [
|
||||
open(firstpart(fn0) if fn0 else fn1, mode=open_files.mode)
|
||||
for fn0, fn1 in zip(details, downfn0)
|
||||
]
|
||||
|
||||
def commit_many(self, open_files):
|
||||
self.fs.put([f.fn for f in open_files], [f.path for f in open_files])
|
||||
[f.close() for f in open_files]
|
||||
for f in open_files:
|
||||
# in case autocommit is off, and so close did not already delete
|
||||
try:
|
||||
os.remove(f.name)
|
||||
except FileNotFoundError:
|
||||
pass
|
||||
self._cache_size = None
|
||||
|
||||
def _make_local_details(self, path):
|
||||
hash = self._mapper(path)
|
||||
fn = os.path.join(self.storage[-1], hash)
|
||||
detail = {
|
||||
"original": path,
|
||||
"fn": hash,
|
||||
"blocks": True,
|
||||
"time": time.time(),
|
||||
"uid": self.fs.ukey(path),
|
||||
}
|
||||
self._metadata.update_file(path, detail)
|
||||
logger.debug("Copying %s to local cache", path)
|
||||
return fn
|
||||
|
||||
def cat(
|
||||
self,
|
||||
path,
|
||||
recursive=False,
|
||||
on_error="raise",
|
||||
callback=DEFAULT_CALLBACK,
|
||||
**kwargs,
|
||||
):
|
||||
paths = self.expand_path(
|
||||
path, recursive=recursive, maxdepth=kwargs.get("maxdepth")
|
||||
)
|
||||
getpaths = []
|
||||
storepaths = []
|
||||
fns = []
|
||||
out = {}
|
||||
for p in paths.copy():
|
||||
try:
|
||||
detail = self._check_file(p)
|
||||
if not detail:
|
||||
fn = self._make_local_details(p)
|
||||
getpaths.append(p)
|
||||
storepaths.append(fn)
|
||||
else:
|
||||
detail, fn = detail if isinstance(detail, tuple) else (None, detail)
|
||||
fns.append(fn)
|
||||
except Exception as e:
|
||||
if on_error == "raise":
|
||||
raise
|
||||
if on_error == "return":
|
||||
out[p] = e
|
||||
paths.remove(p)
|
||||
|
||||
if getpaths:
|
||||
self.fs.get(getpaths, storepaths)
|
||||
self.save_cache()
|
||||
|
||||
callback.set_size(len(paths))
|
||||
for p, fn in zip(paths, fns):
|
||||
with open(fn, "rb") as f:
|
||||
out[p] = f.read()
|
||||
callback.relative_update(1)
|
||||
if isinstance(path, str) and len(paths) == 1 and recursive is False:
|
||||
out = out[paths[0]]
|
||||
return out
|
||||
|
||||
def _open(self, path, mode="rb", **kwargs):
|
||||
path = self._strip_protocol(path)
|
||||
if "r" not in mode:
|
||||
hash = self._mapper(path)
|
||||
fn = os.path.join(self.storage[-1], hash)
|
||||
user_specified_kwargs = {
|
||||
k: v
|
||||
for k, v in kwargs.items()
|
||||
# those kwargs were added by open(), we don't want them
|
||||
if k not in ["autocommit", "block_size", "cache_options"]
|
||||
}
|
||||
return LocalTempFile(self, path, mode=mode, fn=fn, **user_specified_kwargs)
|
||||
detail = self._check_file(path)
|
||||
if detail:
|
||||
detail, fn = detail
|
||||
_, blocks = detail["fn"], detail["blocks"]
|
||||
if blocks is True:
|
||||
logger.debug("Opening local copy of %s", path)
|
||||
|
||||
# In order to support downstream filesystems to be able to
|
||||
# infer the compression from the original filename, like
|
||||
# the `TarFileSystem`, let's extend the `io.BufferedReader`
|
||||
# fileobject protocol by adding a dedicated attribute
|
||||
# `original`.
|
||||
f = open(fn, mode)
|
||||
f.original = detail.get("original")
|
||||
return f
|
||||
else:
|
||||
raise ValueError(
|
||||
f"Attempt to open partially cached file {path}"
|
||||
f" as a wholly cached file"
|
||||
)
|
||||
else:
|
||||
fn = self._make_local_details(path)
|
||||
kwargs["mode"] = mode
|
||||
|
||||
# call target filesystems open
|
||||
self._mkcache()
|
||||
if self.compression:
|
||||
with self.fs._open(path, **kwargs) as f, open(fn, "wb") as f2:
|
||||
if isinstance(f, AbstractBufferedFile):
|
||||
# want no type of caching if just downloading whole thing
|
||||
f.cache = BaseCache(0, f.cache.fetcher, f.size)
|
||||
comp = (
|
||||
infer_compression(path)
|
||||
if self.compression == "infer"
|
||||
else self.compression
|
||||
)
|
||||
f = compr[comp](f, mode="rb")
|
||||
data = True
|
||||
while data:
|
||||
block = getattr(f, "blocksize", 5 * 2**20)
|
||||
data = f.read(block)
|
||||
f2.write(data)
|
||||
else:
|
||||
self.fs.get_file(path, fn)
|
||||
self.save_cache()
|
||||
return self._open(path, mode)
|
||||
|
||||
|
||||
class SimpleCacheFileSystem(WholeFileCacheFileSystem):
|
||||
"""Caches whole remote files on first access
|
||||
|
||||
This class is intended as a layer over any other file system, and
|
||||
will make a local copy of each file accessed, so that all subsequent
|
||||
reads are local. This implementation only copies whole files, and
|
||||
does not keep any metadata about the download time or file details.
|
||||
It is therefore safer to use in multi-threaded/concurrent situations.
|
||||
|
||||
This is the only one of the caching filesystems that supports write: you will
|
||||
be given a real local open file, and upon close and commit, it will be
|
||||
uploaded to the target filesystem; the writability of the target URL is
|
||||
not checked until that time.
|
||||
|
||||
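
A minimal write sketch (the target URL is illustrative, and "simplecache::"
URL-chaining via ``fsspec.open`` is assumed to be available):

>>> import fsspec
>>> with fsspec.open("simplecache::s3://bucket/out.bin", "wb") as f:
...     f.write(b"payload")  # written to a local temp file, uploaded on close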
"""
|
||||
|
||||
protocol = "simplecache"
|
||||
local_file = True
|
||||
transaction_type = WriteCachedTransaction
|
||||
|
||||
def __init__(self, **kwargs):
|
||||
kw = kwargs.copy()
|
||||
for key in ["cache_check", "expiry_time", "check_files"]:
|
||||
kw[key] = False
|
||||
super().__init__(**kw)
|
||||
for storage in self.storage:
|
||||
if not os.path.exists(storage):
|
||||
os.makedirs(storage, exist_ok=True)
|
||||
|
||||
def _check_file(self, path):
|
||||
self._check_cache()
|
||||
sha = self._mapper(path)
|
||||
for storage in self.storage:
|
||||
fn = os.path.join(storage, sha)
|
||||
if os.path.exists(fn):
|
||||
return fn
|
||||
|
||||
def save_cache(self):
|
||||
pass
|
||||
|
||||
def load_cache(self):
|
||||
pass
|
||||
|
||||
def pipe_file(self, path, value=None, **kwargs):
|
||||
if self._intrans:
|
||||
with self.open(path, "wb") as f:
|
||||
f.write(value)
|
||||
else:
|
||||
super().pipe_file(path, value)
|
||||
|
||||
def ls(self, path, detail=True, **kwargs):
|
||||
path = self._strip_protocol(path)
|
||||
details = []
|
||||
try:
|
||||
details = self.fs.ls(
|
||||
path, detail=True, **kwargs
|
||||
).copy() # don't edit original!
|
||||
except FileNotFoundError as e:
|
||||
ex = e
|
||||
else:
|
||||
ex = None
|
||||
if self._intrans:
|
||||
path1 = path.rstrip("/") + "/"
|
||||
for f in self.transaction.files:
|
||||
if f.path == path:
|
||||
details.append(
|
||||
{"name": path, "size": f.size or f.tell(), "type": "file"}
|
||||
)
|
||||
elif f.path.startswith(path1):
|
||||
if f.path.count("/") == path1.count("/"):
|
||||
details.append(
|
||||
{"name": f.path, "size": f.size or f.tell(), "type": "file"}
|
||||
)
|
||||
else:
|
||||
dname = "/".join(f.path.split("/")[: path1.count("/") + 1])
|
||||
details.append({"name": dname, "size": 0, "type": "directory"})
|
||||
if ex is not None and not details:
|
||||
raise ex
|
||||
if detail:
|
||||
return details
|
||||
return sorted(_["name"] for _ in details)
|
||||
|
||||
def info(self, path, **kwargs):
|
||||
path = self._strip_protocol(path)
|
||||
if self._intrans:
|
||||
f = [_ for _ in self.transaction.files if _.path == path]
|
||||
if f:
|
||||
size = os.path.getsize(f[0].fn) if f[0].closed else f[0].tell()
|
||||
return {"name": path, "size": size, "type": "file"}
|
||||
f = any(_.path.startswith(path + "/") for _ in self.transaction.files)
|
||||
if f:
|
||||
return {"name": path, "size": 0, "type": "directory"}
|
||||
return self.fs.info(path, **kwargs)
|
||||
|
||||
def pipe(self, path, value=None, **kwargs):
|
||||
if isinstance(path, str):
|
||||
self.pipe_file(self._strip_protocol(path), value, **kwargs)
|
||||
elif isinstance(path, dict):
|
||||
for k, v in path.items():
|
||||
self.pipe_file(self._strip_protocol(k), v, **kwargs)
|
||||
else:
|
||||
raise ValueError("path must be str or dict")
|
||||
|
||||
def cat_ranges(
|
||||
self, paths, starts, ends, max_gap=None, on_error="return", **kwargs
|
||||
):
|
||||
lpaths = [self._check_file(p) for p in paths]
|
||||
rpaths = [p for l, p in zip(lpaths, paths) if l is False]
|
||||
lpaths = [l for l, p in zip(lpaths, paths) if l is False]
|
||||
self.fs.get(rpaths, lpaths)
|
||||
return super().cat_ranges(
|
||||
paths, starts, ends, max_gap=max_gap, on_error=on_error, **kwargs
|
||||
)
|
||||
|
||||
def _open(self, path, mode="rb", **kwargs):
|
||||
path = self._strip_protocol(path)
|
||||
sha = self._mapper(path)
|
||||
|
||||
if "r" not in mode:
|
||||
fn = os.path.join(self.storage[-1], sha)
|
||||
user_specified_kwargs = {
|
||||
k: v
|
||||
for k, v in kwargs.items()
|
||||
if k not in ["autocommit", "block_size", "cache_options"]
|
||||
} # those were added by open()
|
||||
return LocalTempFile(
|
||||
self,
|
||||
path,
|
||||
mode=mode,
|
||||
autocommit=not self._intrans,
|
||||
fn=fn,
|
||||
**user_specified_kwargs,
|
||||
)
|
||||
fn = self._check_file(path)
|
||||
if fn:
|
||||
return open(fn, mode)
|
||||
|
||||
fn = os.path.join(self.storage[-1], sha)
|
||||
logger.debug("Copying %s to local cache", path)
|
||||
kwargs["mode"] = mode
|
||||
|
||||
self._mkcache()
|
||||
self._cache_size = None
|
||||
if self.compression:
|
||||
with self.fs._open(path, **kwargs) as f, open(fn, "wb") as f2:
|
||||
if isinstance(f, AbstractBufferedFile):
|
||||
# want no type of caching if just downloading whole thing
|
||||
f.cache = BaseCache(0, f.cache.fetcher, f.size)
|
||||
comp = (
|
||||
infer_compression(path)
|
||||
if self.compression == "infer"
|
||||
else self.compression
|
||||
)
|
||||
f = compr[comp](f, mode="rb")
|
||||
data = True
|
||||
while data:
|
||||
block = getattr(f, "blocksize", 5 * 2**20)
|
||||
data = f.read(block)
|
||||
f2.write(data)
|
||||
else:
|
||||
self.fs.get_file(path, fn)
|
||||
return self._open(path, mode)
|
||||
|
||||
|
||||
class LocalTempFile:
|
||||
"""A temporary local file, which will be uploaded on commit"""
|
||||
|
||||
def __init__(self, fs, path, fn, mode="wb", autocommit=True, seek=0, **kwargs):
|
||||
self.fn = fn
|
||||
self.fh = open(fn, mode)
|
||||
self.mode = mode
|
||||
if seek:
|
||||
self.fh.seek(seek)
|
||||
self.path = path
|
||||
self.size = None
|
||||
self.fs = fs
|
||||
self.closed = False
|
||||
self.autocommit = autocommit
|
||||
self.kwargs = kwargs
|
||||
|
||||
def __reduce__(self):
|
||||
# always open in r+b to allow continuing writing at a location
|
||||
return (
|
||||
LocalTempFile,
|
||||
(self.fs, self.path, self.fn, "r+b", self.autocommit, self.tell()),
|
||||
)
|
||||
|
||||
def __enter__(self):
|
||||
return self.fh
|
||||
|
||||
def __exit__(self, exc_type, exc_val, exc_tb):
|
||||
self.close()
|
||||
|
||||
def close(self):
|
||||
# self.size = self.fh.tell()
|
||||
if self.closed:
|
||||
return
|
||||
self.fh.close()
|
||||
self.closed = True
|
||||
if self.autocommit:
|
||||
self.commit()
|
||||
|
||||
def discard(self):
|
||||
self.fh.close()
|
||||
os.remove(self.fn)
|
||||
|
||||
def commit(self):
|
||||
self.fs.put(self.fn, self.path, **self.kwargs)
|
||||
# we do not delete local copy - it's still in the cache
|
||||
|
||||
@property
|
||||
def name(self):
|
||||
return self.fn
|
||||
|
||||
def __repr__(self) -> str:
|
||||
return f"LocalTempFile: {self.path}"
|
||||
|
||||
def __getattr__(self, item):
|
||||
return getattr(self.fh, item)
|
||||
@@ -0,0 +1,152 @@
|
||||
import dask
|
||||
from distributed.client import Client, _get_global_client
|
||||
from distributed.worker import Worker
|
||||
|
||||
from fsspec import filesystem
|
||||
from fsspec.spec import AbstractBufferedFile, AbstractFileSystem
|
||||
from fsspec.utils import infer_storage_options
|
||||
|
||||
|
||||
def _get_client(client):
|
||||
if client is None:
|
||||
return _get_global_client()
|
||||
elif isinstance(client, Client):
|
||||
return client
|
||||
else:
|
||||
# e.g., connection string
|
||||
return Client(client)
|
||||
|
||||
|
||||
def _in_worker():
|
||||
return bool(Worker._instances)
|
||||
|
||||
|
||||
class DaskWorkerFileSystem(AbstractFileSystem):
|
||||
"""View files accessible to a worker as any other remote file-system
|
||||
|
||||
When instances are run on the worker, uses the real filesystem. When
|
||||
run on the client, they call the worker to provide information or data.
|
||||
|
||||
**Warning** this implementation is experimental, and read-only for now.
|
||||
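
A minimal sketch (the scheduler address and path are illustrative):

>>> fs = DaskWorkerFileSystem(target_protocol="file", client="tcp://scheduler:8786")
>>> fs.ls("/data")  # runs on a worker; the result is returned to the client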
"""
|
||||
|
||||
def __init__(
|
||||
self, target_protocol=None, target_options=None, fs=None, client=None, **kwargs
|
||||
):
|
||||
super().__init__(**kwargs)
|
||||
if not (fs is None) ^ (target_protocol is None):
|
||||
raise ValueError(
|
||||
"Please provide one of filesystem instance (fs) or"
|
||||
" target_protocol, not both"
|
||||
)
|
||||
self.target_protocol = target_protocol
|
||||
self.target_options = target_options
|
||||
self.worker = None
|
||||
self.client = client
|
||||
self.fs = fs
|
||||
self._determine_worker()
|
||||
|
||||
@staticmethod
|
||||
def _get_kwargs_from_urls(path):
|
||||
so = infer_storage_options(path)
|
||||
if "host" in so and "port" in so:
|
||||
return {"client": f"{so['host']}:{so['port']}"}
|
||||
else:
|
||||
return {}
|
||||
|
||||
def _determine_worker(self):
|
||||
if _in_worker():
|
||||
self.worker = True
|
||||
if self.fs is None:
|
||||
self.fs = filesystem(
|
||||
self.target_protocol, **(self.target_options or {})
|
||||
)
|
||||
else:
|
||||
self.worker = False
|
||||
self.client = _get_client(self.client)
|
||||
self.rfs = dask.delayed(self)
|
||||
|
||||
def mkdir(self, *args, **kwargs):
|
||||
if self.worker:
|
||||
self.fs.mkdir(*args, **kwargs)
|
||||
else:
|
||||
self.rfs.mkdir(*args, **kwargs).compute()
|
||||
|
||||
def rm(self, *args, **kwargs):
|
||||
if self.worker:
|
||||
self.fs.rm(*args, **kwargs)
|
||||
else:
|
||||
self.rfs.rm(*args, **kwargs).compute()
|
||||
|
||||
def copy(self, *args, **kwargs):
|
||||
if self.worker:
|
||||
self.fs.copy(*args, **kwargs)
|
||||
else:
|
||||
self.rfs.copy(*args, **kwargs).compute()
|
||||
|
||||
def mv(self, *args, **kwargs):
|
||||
if self.worker:
|
||||
self.fs.mv(*args, **kwargs)
|
||||
else:
|
||||
self.rfs.mv(*args, **kwargs).compute()
|
||||
|
||||
def ls(self, *args, **kwargs):
|
||||
if self.worker:
|
||||
return self.fs.ls(*args, **kwargs)
|
||||
else:
|
||||
return self.rfs.ls(*args, **kwargs).compute()
|
||||
|
||||
def _open(
|
||||
self,
|
||||
path,
|
||||
mode="rb",
|
||||
block_size=None,
|
||||
autocommit=True,
|
||||
cache_options=None,
|
||||
**kwargs,
|
||||
):
|
||||
if self.worker:
|
||||
return self.fs._open(
|
||||
path,
|
||||
mode=mode,
|
||||
block_size=block_size,
|
||||
autocommit=autocommit,
|
||||
cache_options=cache_options,
|
||||
**kwargs,
|
||||
)
|
||||
else:
|
||||
return DaskFile(
|
||||
fs=self,
|
||||
path=path,
|
||||
mode=mode,
|
||||
block_size=block_size,
|
||||
autocommit=autocommit,
|
||||
cache_options=cache_options,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
def fetch_range(self, path, mode, start, end):
|
||||
if self.worker:
|
||||
with self._open(path, mode) as f:
|
||||
f.seek(start)
|
||||
return f.read(end - start)
|
||||
else:
|
||||
return self.rfs.fetch_range(path, mode, start, end).compute()
|
||||
|
||||
|
||||
class DaskFile(AbstractBufferedFile):
|
||||
def __init__(self, mode="rb", **kwargs):
|
||||
if mode != "rb":
|
||||
raise ValueError('Remote dask files can only be opened in "rb" mode')
|
||||
super().__init__(**kwargs)
|
||||
|
||||
def _upload_chunk(self, final=False):
|
||||
pass
|
||||
|
||||
def _initiate_upload(self):
|
||||
"""Create remote file/upload"""
|
||||
pass
|
||||
|
||||
def _fetch_range(self, start, end):
|
||||
"""Get the specified set of bytes from remote"""
|
||||
return self.fs.fetch_range(self.path, self.mode, start, end)
|
||||
@@ -0,0 +1,58 @@
|
||||
import base64
|
||||
import io
|
||||
from typing import Optional
|
||||
from urllib.parse import unquote
|
||||
|
||||
from fsspec import AbstractFileSystem
|
||||
|
||||
|
||||
class DataFileSystem(AbstractFileSystem):
|
||||
"""A handy decoder for data-URLs
|
||||
|
||||
Example
|
||||
-------
|
||||
>>> with fsspec.open("data:,Hello%2C%20World%21") as f:
|
||||
... print(f.read())
|
||||
b"Hello, World!"
|
||||
|
||||
See https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URLs
|
||||
"""
|
||||
|
||||
protocol = "data"
|
||||
|
||||
def __init__(self, **kwargs):
|
||||
"""No parameters for this filesystem"""
|
||||
super().__init__(**kwargs)
|
||||
|
||||
def cat_file(self, path, start=None, end=None, **kwargs):
|
||||
pref, data = path.split(",", 1)
|
||||
if pref.endswith("base64"):
|
||||
return base64.b64decode(data)[start:end]
|
||||
return unquote(data).encode()[start:end]
|
||||
|
||||
def info(self, path, **kwargs):
|
||||
pref, name = path.split(",", 1)
|
||||
data = self.cat_file(path)
|
||||
mime = pref.split(":", 1)[1].split(";", 1)[0]
|
||||
return {"name": name, "size": len(data), "type": "file", "mimetype": mime}
|
||||
|
||||
def _open(
|
||||
self,
|
||||
path,
|
||||
mode="rb",
|
||||
block_size=None,
|
||||
autocommit=True,
|
||||
cache_options=None,
|
||||
**kwargs,
|
||||
):
|
||||
if "r" not in mode:
|
||||
raise ValueError("Read only filesystem")
|
||||
return io.BytesIO(self.cat_file(path))
|
||||
|
||||
@staticmethod
|
||||
def encode(data: bytes, mime: Optional[str] = None):
|
||||
"""Format the given data into data-URL syntax
|
||||
|
||||
This version always base64 encodes, even when the data is ascii/url-safe.
|
||||
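
Example:

>>> DataFileSystem.encode(b"hello", "text/plain")
'data:text/plain;base64,aGVsbG8='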
"""
|
||||
return f"data:{mime or ''};base64,{base64.b64encode(data).decode()}"
|
||||
@@ -0,0 +1,467 @@
|
||||
import base64
|
||||
import urllib
|
||||
|
||||
import requests
|
||||
import requests.exceptions
|
||||
from requests.adapters import HTTPAdapter, Retry
|
||||
|
||||
from fsspec import AbstractFileSystem
|
||||
from fsspec.spec import AbstractBufferedFile
|
||||
|
||||
|
||||
class DatabricksException(Exception):
|
||||
"""
|
||||
Helper class for exceptions raised in this module.
|
||||
"""
|
||||
|
||||
def __init__(self, error_code, message):
|
||||
"""Create a new DatabricksException"""
|
||||
super().__init__(message)
|
||||
|
||||
self.error_code = error_code
|
||||
self.message = message
|
||||
|
||||
|
||||
class DatabricksFileSystem(AbstractFileSystem):
|
||||
"""
|
||||
Get access to the Databricks filesystem implementation over HTTP.
|
||||
Can be used inside and outside of a databricks cluster.
|
||||
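
A minimal construction sketch (the instance URL and token are placeholders):

>>> fs = DatabricksFileSystem(
...     instance="adb-1234567890123456.7.azuredatabricks.net",
...     token="<personal-access-token>",
... )
>>> fs.ls("/")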
"""
|
||||
|
||||
def __init__(self, instance, token, **kwargs):
|
||||
"""
|
||||
Create a new DatabricksFileSystem.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
instance: str
|
||||
The instance URL of the databricks cluster.
|
||||
For example for an Azure databricks cluster, this
|
||||
has the form adb-<some-number>.<two digits>.azuredatabricks.net.
|
||||
token: str
|
||||
Your personal token. Find out more
|
||||
here: https://docs.databricks.com/dev-tools/api/latest/authentication.html
|
||||
"""
|
||||
self.instance = instance
|
||||
self.token = token
|
||||
self.session = requests.Session()
|
||||
self.retries = Retry(
|
||||
total=10,
|
||||
backoff_factor=0.05,
|
||||
status_forcelist=[408, 429, 500, 502, 503, 504],
|
||||
)
|
||||
|
||||
self.session.mount("https://", HTTPAdapter(max_retries=self.retries))
|
||||
self.session.headers.update({"Authorization": f"Bearer {self.token}"})
|
||||
|
||||
super().__init__(**kwargs)
|
||||
|
||||
def ls(self, path, detail=True, **kwargs):
|
||||
"""
|
||||
List the contents of the given path.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
path: str
|
||||
Absolute path
|
||||
detail: bool
|
||||
Return not only the list of filenames,
|
||||
but also additional information on file sizes
|
||||
and types.
|
||||
"""
|
||||
out = self._ls_from_cache(path)
|
||||
if not out:
|
||||
try:
|
||||
r = self._send_to_api(
|
||||
method="get", endpoint="list", json={"path": path}
|
||||
)
|
||||
except DatabricksException as e:
|
||||
if e.error_code == "RESOURCE_DOES_NOT_EXIST":
|
||||
raise FileNotFoundError(e.message) from e
|
||||
|
||||
raise
|
||||
files = r["files"]
|
||||
out = [
|
||||
{
|
||||
"name": o["path"],
|
||||
"type": "directory" if o["is_dir"] else "file",
|
||||
"size": o["file_size"],
|
||||
}
|
||||
for o in files
|
||||
]
|
||||
self.dircache[path] = out
|
||||
|
||||
if detail:
|
||||
return out
|
||||
return [o["name"] for o in out]
|
||||
|
||||
def makedirs(self, path, exist_ok=True):
|
||||
"""
|
||||
Create a given absolute path and all of its parents.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
path: str
|
||||
Absolute path to create
|
||||
exist_ok: bool
|
||||
If false, checks if the folder
|
||||
exists before creating it (and raises an
|
||||
Exception if this is the case)
|
||||
"""
|
||||
if not exist_ok:
|
||||
try:
|
||||
# If the following succeeds, the path is already present
|
||||
self._send_to_api(
|
||||
method="get", endpoint="get-status", json={"path": path}
|
||||
)
|
||||
raise FileExistsError(f"Path {path} already exists")
|
||||
except DatabricksException as e:
|
||||
if e.error_code == "RESOURCE_DOES_NOT_EXIST":
|
||||
pass
|
||||
|
||||
try:
|
||||
self._send_to_api(method="post", endpoint="mkdirs", json={"path": path})
|
||||
except DatabricksException as e:
|
||||
if e.error_code == "RESOURCE_ALREADY_EXISTS":
|
||||
raise FileExistsError(e.message) from e
|
||||
|
||||
raise
|
||||
self.invalidate_cache(self._parent(path))
|
||||
|
||||
def mkdir(self, path, create_parents=True, **kwargs):
|
||||
"""
|
||||
Create a given absolute path and all of its parents.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
path: str
|
||||
Absolute path to create
|
||||
create_parents: bool
|
||||
Whether to create all parents or not.
|
||||
"False" is not implemented so far.
|
||||
"""
|
||||
if not create_parents:
|
||||
raise NotImplementedError
|
||||
|
||||
self.mkdirs(path, **kwargs)
|
||||
|
||||
def rm(self, path, recursive=False, **kwargs):
|
||||
"""
|
||||
Remove the file or folder at the given absolute path.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
path: str
|
||||
Absolute path to remove
|
||||
recursive: bool
|
||||
Recursively delete all files in a folder.
|
||||
"""
|
||||
try:
|
||||
self._send_to_api(
|
||||
method="post",
|
||||
endpoint="delete",
|
||||
json={"path": path, "recursive": recursive},
|
||||
)
|
||||
except DatabricksException as e:
|
||||
# This is not really an exception, it just means
|
||||
# not everything was deleted so far
|
||||
if e.error_code == "PARTIAL_DELETE":
|
||||
self.rm(path=path, recursive=recursive)
|
||||
elif e.error_code == "IO_ERROR":
|
||||
# Using the same exception as the os module would use here
|
||||
raise OSError(e.message) from e
|
||||
|
||||
raise
|
||||
self.invalidate_cache(self._parent(path))
|
||||
|
||||
def mv(
|
||||
self, source_path, destination_path, recursive=False, maxdepth=None, **kwargs
|
||||
):
|
||||
"""
|
||||
Move a source to a destination path.
|
||||
|
||||
A note from the original [databricks API manual]
|
||||
(https://docs.databricks.com/dev-tools/api/latest/dbfs.html#move).
|
||||
|
||||
When moving a large number of files the API call will time out after
|
||||
approximately 60s, potentially resulting in partially moved data.
|
||||
Therefore, for operations that move more than 10k files, we strongly
|
||||
discourage using the DBFS REST API.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
source_path: str
|
||||
From where to move (absolute path)
|
||||
destination_path: str
|
||||
To where to move (absolute path)
|
||||
recursive: bool
|
||||
Not implemented so far.
|
||||
maxdepth:
|
||||
Not implemented so far.
|
||||
"""
|
||||
if recursive:
|
||||
raise NotImplementedError
|
||||
if maxdepth:
|
||||
raise NotImplementedError
|
||||
|
||||
try:
|
||||
self._send_to_api(
|
||||
method="post",
|
||||
endpoint="move",
|
||||
json={"source_path": source_path, "destination_path": destination_path},
|
||||
)
|
||||
except DatabricksException as e:
|
||||
if e.error_code == "RESOURCE_DOES_NOT_EXIST":
|
||||
raise FileNotFoundError(e.message) from e
|
||||
elif e.error_code == "RESOURCE_ALREADY_EXISTS":
|
||||
raise FileExistsError(e.message) from e
|
||||
|
||||
raise
|
||||
self.invalidate_cache(self._parent(source_path))
|
||||
self.invalidate_cache(self._parent(destination_path))
|
||||
|
||||
def _open(self, path, mode="rb", block_size="default", **kwargs):
|
||||
"""
|
||||
Overwrite the base class method to make sure to create a DBFile.
|
||||
All arguments are copied from the base method.
|
||||
|
||||
Only the default blocksize is allowed.
|
||||
"""
|
||||
return DatabricksFile(self, path, mode=mode, block_size=block_size, **kwargs)
|
||||
|
||||
def _send_to_api(self, method, endpoint, json):
|
||||
"""
|
||||
Send the given json to the DBFS API
|
||||
using a get or post request (specified by the argument `method`).
|
||||
|
||||
Parameters
|
||||
----------
|
||||
method: str
|
||||
Which http method to use for communication; "get" or "post".
|
||||
endpoint: str
|
||||
Where to send the request to (last part of the API URL)
|
||||
json: dict
|
||||
Dictionary of information to send
|
||||
"""
|
||||
if method == "post":
|
||||
session_call = self.session.post
|
||||
elif method == "get":
|
||||
session_call = self.session.get
|
||||
else:
|
||||
raise ValueError(f"Do not understand method {method}")
|
||||
|
||||
url = urllib.parse.urljoin(f"https://{self.instance}/api/2.0/dbfs/", endpoint)
|
||||
|
||||
r = session_call(url, json=json)
|
||||
|
||||
# The DBFS API will return a json, also in case of an exception.
|
||||
# We want to preserve this information as well as possible.
|
||||
try:
|
||||
r.raise_for_status()
|
||||
except requests.HTTPError as e:
|
||||
# try to extract json error message
|
||||
# if that fails, fall back to the original exception
|
||||
try:
|
||||
exception_json = e.response.json()
|
||||
except Exception:
|
||||
raise e from None
|
||||
|
||||
raise DatabricksException(**exception_json) from e
|
||||
|
||||
return r.json()
|
||||
|
||||
def _create_handle(self, path, overwrite=True):
|
||||
"""
|
||||
Internal function to create a handle, which can be used to
|
||||
write blocks of a file to DBFS.
|
||||
A handle has a unique identifier which needs to be passed
|
||||
whenever writing during this transaction.
|
||||
The handle is active for 10 minutes - after that a new
|
||||
write transaction needs to be created.
|
||||
Make sure to close the handle after you are finished.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
path: str
|
||||
Absolute path for this file.
|
||||
overwrite: bool
|
||||
If a file already exist at this location, either overwrite
|
||||
it or raise an exception.
|
||||
"""
|
||||
try:
|
||||
r = self._send_to_api(
|
||||
method="post",
|
||||
endpoint="create",
|
||||
json={"path": path, "overwrite": overwrite},
|
||||
)
|
||||
return r["handle"]
|
||||
except DatabricksException as e:
|
||||
if e.error_code == "RESOURCE_ALREADY_EXISTS":
|
||||
raise FileExistsError(e.message) from e
|
||||
|
||||
raise
|
||||
|
||||
def _close_handle(self, handle):
|
||||
"""
|
||||
Close a handle, which was opened by :func:`_create_handle`.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
handle: str
|
||||
Which handle to close.
|
||||
"""
|
||||
try:
|
||||
self._send_to_api(method="post", endpoint="close", json={"handle": handle})
|
||||
except DatabricksException as e:
|
||||
if e.error_code == "RESOURCE_DOES_NOT_EXIST":
|
||||
raise FileNotFoundError(e.message) from e
|
||||
|
||||
raise
|
||||
|
||||
def _add_data(self, handle, data):
|
||||
"""
|
||||
Upload data to an already opened file handle
|
||||
(opened by :func:`_create_handle`).
|
||||
The maximum allowed data size is 1MB after
|
||||
conversion to base64.
|
||||
Remember to close the handle when you are finished.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
handle: str
|
||||
Which handle to upload data to.
|
||||
data: bytes
|
||||
Block of data to add to the handle.
|
||||
"""
|
||||
data = base64.b64encode(data).decode()
|
||||
try:
|
||||
self._send_to_api(
|
||||
method="post",
|
||||
endpoint="add-block",
|
||||
json={"handle": handle, "data": data},
|
||||
)
|
||||
except DatabricksException as e:
|
||||
if e.error_code == "RESOURCE_DOES_NOT_EXIST":
|
||||
raise FileNotFoundError(e.message) from e
|
||||
elif e.error_code == "MAX_BLOCK_SIZE_EXCEEDED":
|
||||
raise ValueError(e.message) from e
|
||||
|
||||
raise
|
||||
|
||||
def _get_data(self, path, start, end):
|
||||
"""
|
||||
Download data in bytes from a given absolute path in a block
|
||||
from [start, start+length].
|
||||
The maximum number of allowed bytes to read is 1MB.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
path: str
|
||||
Absolute path to download data from
|
||||
start: int
|
||||
Start position of the block
|
||||
end: int
|
||||
End position of the block
|
||||
"""
|
||||
try:
|
||||
r = self._send_to_api(
|
||||
method="get",
|
||||
endpoint="read",
|
||||
json={"path": path, "offset": start, "length": end - start},
|
||||
)
|
||||
return base64.b64decode(r["data"])
|
||||
except DatabricksException as e:
|
||||
if e.error_code == "RESOURCE_DOES_NOT_EXIST":
|
||||
raise FileNotFoundError(e.message) from e
|
||||
elif e.error_code in ["INVALID_PARAMETER_VALUE", "MAX_READ_SIZE_EXCEEDED"]:
|
||||
raise ValueError(e.message) from e
|
||||
|
||||
raise
|
||||
|
||||
def invalidate_cache(self, path=None):
|
||||
if path is None:
|
||||
self.dircache.clear()
|
||||
else:
|
||||
self.dircache.pop(path, None)
|
||||
super().invalidate_cache(path)
|
||||
|
||||
|
||||
class DatabricksFile(AbstractBufferedFile):
|
||||
"""
|
||||
Helper class for files referenced in the DatabricksFileSystem.
|
||||
"""
|
||||
|
||||
DEFAULT_BLOCK_SIZE = 1 * 2**20 # only allowed block size
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
fs,
|
||||
path,
|
||||
mode="rb",
|
||||
block_size="default",
|
||||
autocommit=True,
|
||||
cache_type="readahead",
|
||||
cache_options=None,
|
||||
**kwargs,
|
||||
):
|
||||
"""
|
||||
Create a new instance of the DatabricksFile.
|
||||
|
||||
The blocksize needs to be the default one.
|
||||
"""
|
||||
if block_size is None or block_size == "default":
|
||||
block_size = self.DEFAULT_BLOCK_SIZE
|
||||
|
||||
assert block_size == self.DEFAULT_BLOCK_SIZE, (
|
||||
f"Only the default block size is allowed, not {block_size}"
|
||||
)
|
||||
|
||||
super().__init__(
|
||||
fs,
|
||||
path,
|
||||
mode=mode,
|
||||
block_size=block_size,
|
||||
autocommit=autocommit,
|
||||
cache_type=cache_type,
|
||||
cache_options=cache_options or {},
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
def _initiate_upload(self):
|
||||
"""Internal function to start a file upload"""
|
||||
self.handle = self.fs._create_handle(self.path)
|
||||
|
||||
def _upload_chunk(self, final=False):
|
||||
"""Internal function to add a chunk of data to a started upload"""
|
||||
self.buffer.seek(0)
|
||||
data = self.buffer.getvalue()
|
||||
|
||||
data_chunks = [
|
||||
data[start:end] for start, end in self._to_sized_blocks(len(data))
|
||||
]
|
||||
|
||||
for data_chunk in data_chunks:
|
||||
self.fs._add_data(handle=self.handle, data=data_chunk)
|
||||
|
||||
if final:
|
||||
self.fs._close_handle(handle=self.handle)
|
||||
return True
|
||||
|
||||
def _fetch_range(self, start, end):
|
||||
"""Internal function to download a block of data"""
|
||||
return_buffer = b""
|
||||
length = end - start
|
||||
for chunk_start, chunk_end in self._to_sized_blocks(length, start):
|
||||
return_buffer += self.fs._get_data(
|
||||
path=self.path, start=chunk_start, end=chunk_end
|
||||
)
|
||||
|
||||
return return_buffer
|
||||
|
||||
def _to_sized_blocks(self, length, start=0):
|
||||
"""Helper function to split a range from 0 to total_length into bloksizes"""
|
||||
end = start + length
|
||||
for data_chunk in range(start, end, self.blocksize):
|
||||
data_start = data_chunk
|
||||
data_end = min(end, data_chunk + self.blocksize)
|
||||
yield data_start, data_end
|
||||
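# Worked illustration of _to_sized_blocks (blocksize = 1 * 2**20 bytes, values
# computed from the loop above): for start=0 and length=2_621_440 (2.5 MB) it
# yields (0, 1048576), (1048576, 2097152), (2097152, 2621440), i.e. two full
# 1 MB blocks plus a final partial block, matching the DBFS 1 MB transfer limit.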
@@ -0,0 +1,388 @@
|
||||
from .. import filesystem
|
||||
from ..asyn import AsyncFileSystem
|
||||
|
||||
|
||||
class DirFileSystem(AsyncFileSystem):
|
||||
"""Directory prefix filesystem
|
||||
|
||||
The DirFileSystem is a filesystem-wrapper. It assumes every path it is dealing with
|
||||
is relative to the `path`. After performing the necessary path operations, it
|
||||
delegates everything to the wrapped filesystem.
|
||||
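
A minimal usage sketch (the wrapped directory is illustrative):

>>> import fsspec
>>> dirfs = fsspec.filesystem("dir", path="/data", target_protocol="file")
>>> dirfs.ls("")  # lists the contents of /data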
"""
|
||||
|
||||
protocol = "dir"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
path=None,
|
||||
fs=None,
|
||||
fo=None,
|
||||
target_protocol=None,
|
||||
target_options=None,
|
||||
**storage_options,
|
||||
):
|
||||
"""
|
||||
Parameters
|
||||
----------
|
||||
path: str
|
||||
Path to the directory.
|
||||
fs: AbstractFileSystem
|
||||
An instantiated filesystem to wrap.
|
||||
target_protocol, target_options:
|
||||
if fs is none, construct it from these
|
||||
fo: str
|
||||
Alternate for path; do not provide both
|
||||
"""
|
||||
super().__init__(**storage_options)
|
||||
if fs is None:
|
||||
fs = filesystem(protocol=target_protocol, **(target_options or {}))
|
||||
path = path or fo
|
||||
|
||||
if self.asynchronous and not fs.async_impl:
|
||||
raise ValueError("can't use asynchronous with non-async fs")
|
||||
|
||||
if fs.async_impl and self.asynchronous != fs.asynchronous:
|
||||
raise ValueError("both dirfs and fs should be in the same sync/async mode")
|
||||
|
||||
self.path = fs._strip_protocol(path)
|
||||
self.fs = fs
|
||||
|
||||
def _join(self, path):
|
||||
if isinstance(path, str):
|
||||
if not self.path:
|
||||
return path
|
||||
if not path:
|
||||
return self.path
|
||||
return self.fs.sep.join((self.path, self._strip_protocol(path)))
|
||||
if isinstance(path, dict):
|
||||
return {self._join(_path): value for _path, value in path.items()}
|
||||
return [self._join(_path) for _path in path]
|
||||
|
||||
def _relpath(self, path):
|
||||
if isinstance(path, str):
|
||||
if not self.path:
|
||||
return path
|
||||
# We need to account for S3FileSystem returning paths that do not
|
||||
# start with a '/'
|
||||
if path == self.path or (
|
||||
self.path.startswith(self.fs.sep) and path == self.path[1:]
|
||||
):
|
||||
return ""
|
||||
prefix = self.path + self.fs.sep
|
||||
if self.path.startswith(self.fs.sep) and not path.startswith(self.fs.sep):
|
||||
prefix = prefix[1:]
|
||||
assert path.startswith(prefix)
|
||||
return path[len(prefix) :]
|
||||
return [self._relpath(_path) for _path in path]
|
||||
|
||||
# Wrappers below
|
||||
|
||||
@property
|
||||
def sep(self):
|
||||
return self.fs.sep
|
||||
|
||||
async def set_session(self, *args, **kwargs):
|
||||
return await self.fs.set_session(*args, **kwargs)
|
||||
|
||||
async def _rm_file(self, path, **kwargs):
|
||||
return await self.fs._rm_file(self._join(path), **kwargs)
|
||||
|
||||
def rm_file(self, path, **kwargs):
|
||||
return self.fs.rm_file(self._join(path), **kwargs)
|
||||
|
||||
async def _rm(self, path, *args, **kwargs):
|
||||
return await self.fs._rm(self._join(path), *args, **kwargs)
|
||||
|
||||
def rm(self, path, *args, **kwargs):
|
||||
return self.fs.rm(self._join(path), *args, **kwargs)
|
||||
|
||||
async def _cp_file(self, path1, path2, **kwargs):
|
||||
return await self.fs._cp_file(self._join(path1), self._join(path2), **kwargs)
|
||||
|
||||
def cp_file(self, path1, path2, **kwargs):
|
||||
return self.fs.cp_file(self._join(path1), self._join(path2), **kwargs)
|
||||
|
||||
async def _copy(
|
||||
self,
|
||||
path1,
|
||||
path2,
|
||||
*args,
|
||||
**kwargs,
|
||||
):
|
||||
return await self.fs._copy(
|
||||
self._join(path1),
|
||||
self._join(path2),
|
||||
*args,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
def copy(self, path1, path2, *args, **kwargs):
|
||||
return self.fs.copy(
|
||||
self._join(path1),
|
||||
self._join(path2),
|
||||
*args,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
async def _pipe(self, path, *args, **kwargs):
|
||||
return await self.fs._pipe(self._join(path), *args, **kwargs)
|
||||
|
||||
def pipe(self, path, *args, **kwargs):
|
||||
return self.fs.pipe(self._join(path), *args, **kwargs)
|
||||
|
||||
async def _pipe_file(self, path, *args, **kwargs):
|
||||
return await self.fs._pipe_file(self._join(path), *args, **kwargs)
|
||||
|
||||
def pipe_file(self, path, *args, **kwargs):
|
||||
return self.fs.pipe_file(self._join(path), *args, **kwargs)
|
||||
|
||||
async def _cat_file(self, path, *args, **kwargs):
|
||||
return await self.fs._cat_file(self._join(path), *args, **kwargs)
|
||||
|
||||
def cat_file(self, path, *args, **kwargs):
|
||||
return self.fs.cat_file(self._join(path), *args, **kwargs)
|
||||
|
||||
async def _cat(self, path, *args, **kwargs):
|
||||
ret = await self.fs._cat(
|
||||
self._join(path),
|
||||
*args,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
if isinstance(ret, dict):
|
||||
return {self._relpath(key): value for key, value in ret.items()}
|
||||
|
||||
return ret
|
||||
|
||||
def cat(self, path, *args, **kwargs):
|
||||
ret = self.fs.cat(
|
||||
self._join(path),
|
||||
*args,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
if isinstance(ret, dict):
|
||||
return {self._relpath(key): value for key, value in ret.items()}
|
||||
|
||||
return ret
|
||||
|
||||
async def _put_file(self, lpath, rpath, **kwargs):
|
||||
return await self.fs._put_file(lpath, self._join(rpath), **kwargs)
|
||||
|
||||
def put_file(self, lpath, rpath, **kwargs):
|
||||
return self.fs.put_file(lpath, self._join(rpath), **kwargs)
|
||||
|
||||
async def _put(
|
||||
self,
|
||||
lpath,
|
||||
rpath,
|
||||
*args,
|
||||
**kwargs,
|
||||
):
|
||||
return await self.fs._put(
|
||||
lpath,
|
||||
self._join(rpath),
|
||||
*args,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
def put(self, lpath, rpath, *args, **kwargs):
|
||||
return self.fs.put(
|
||||
lpath,
|
||||
self._join(rpath),
|
||||
*args,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
async def _get_file(self, rpath, lpath, **kwargs):
|
||||
return await self.fs._get_file(self._join(rpath), lpath, **kwargs)
|
||||
|
||||
def get_file(self, rpath, lpath, **kwargs):
|
||||
return self.fs.get_file(self._join(rpath), lpath, **kwargs)
|
||||
|
||||
async def _get(self, rpath, *args, **kwargs):
|
||||
return await self.fs._get(self._join(rpath), *args, **kwargs)
|
||||
|
||||
def get(self, rpath, *args, **kwargs):
|
||||
return self.fs.get(self._join(rpath), *args, **kwargs)
|
||||
|
||||
async def _isfile(self, path):
|
||||
return await self.fs._isfile(self._join(path))
|
||||
|
||||
def isfile(self, path):
|
||||
return self.fs.isfile(self._join(path))
|
||||
|
||||
async def _isdir(self, path):
|
||||
return await self.fs._isdir(self._join(path))
|
||||
|
||||
def isdir(self, path):
|
||||
return self.fs.isdir(self._join(path))
|
||||
|
||||
async def _size(self, path):
|
||||
return await self.fs._size(self._join(path))
|
||||
|
||||
def size(self, path):
|
||||
return self.fs.size(self._join(path))
|
||||
|
||||
async def _exists(self, path):
|
||||
return await self.fs._exists(self._join(path))
|
||||
|
||||
def exists(self, path):
|
||||
return self.fs.exists(self._join(path))
|
||||
|
||||
async def _info(self, path, **kwargs):
|
||||
info = await self.fs._info(self._join(path), **kwargs)
|
||||
info = info.copy()
|
||||
info["name"] = self._relpath(info["name"])
|
||||
return info
|
||||
|
||||
def info(self, path, **kwargs):
|
||||
info = self.fs.info(self._join(path), **kwargs)
|
||||
info = info.copy()
|
||||
info["name"] = self._relpath(info["name"])
|
||||
return info
|
||||
|
||||
async def _ls(self, path, detail=True, **kwargs):
|
||||
ret = (await self.fs._ls(self._join(path), detail=detail, **kwargs)).copy()
|
||||
if detail:
|
||||
out = []
|
||||
for entry in ret:
|
||||
entry = entry.copy()
|
||||
entry["name"] = self._relpath(entry["name"])
|
||||
out.append(entry)
|
||||
return out
|
||||
|
||||
return self._relpath(ret)
|
||||
|
||||
def ls(self, path, detail=True, **kwargs):
|
||||
ret = self.fs.ls(self._join(path), detail=detail, **kwargs).copy()
|
||||
if detail:
|
||||
out = []
|
||||
for entry in ret:
|
||||
entry = entry.copy()
|
||||
entry["name"] = self._relpath(entry["name"])
|
||||
out.append(entry)
|
||||
return out
|
||||
|
||||
return self._relpath(ret)
|
||||
|
||||
async def _walk(self, path, *args, **kwargs):
|
||||
async for root, dirs, files in self.fs._walk(self._join(path), *args, **kwargs):
|
||||
yield self._relpath(root), dirs, files
|
||||
|
||||
def walk(self, path, *args, **kwargs):
|
||||
for root, dirs, files in self.fs.walk(self._join(path), *args, **kwargs):
|
||||
yield self._relpath(root), dirs, files
|
||||
|
||||
async def _glob(self, path, **kwargs):
|
||||
detail = kwargs.get("detail", False)
|
||||
ret = await self.fs._glob(self._join(path), **kwargs)
|
||||
if detail:
|
||||
return {self._relpath(path): info for path, info in ret.items()}
|
||||
return self._relpath(ret)
|
||||
|
||||
def glob(self, path, **kwargs):
|
||||
detail = kwargs.get("detail", False)
|
||||
ret = self.fs.glob(self._join(path), **kwargs)
|
||||
if detail:
|
||||
return {self._relpath(path): info for path, info in ret.items()}
|
||||
return self._relpath(ret)
|
||||
|
||||
async def _du(self, path, *args, **kwargs):
|
||||
total = kwargs.get("total", True)
|
||||
ret = await self.fs._du(self._join(path), *args, **kwargs)
|
||||
if total:
|
||||
return ret
|
||||
|
||||
return {self._relpath(path): size for path, size in ret.items()}
|
||||
|
||||
def du(self, path, *args, **kwargs):
|
||||
total = kwargs.get("total", True)
|
||||
ret = self.fs.du(self._join(path), *args, **kwargs)
|
||||
if total:
|
||||
return ret
|
||||
|
||||
return {self._relpath(path): size for path, size in ret.items()}
|
||||
|
||||
async def _find(self, path, *args, **kwargs):
|
||||
detail = kwargs.get("detail", False)
|
||||
ret = await self.fs._find(self._join(path), *args, **kwargs)
|
||||
if detail:
|
||||
return {self._relpath(path): info for path, info in ret.items()}
|
||||
return self._relpath(ret)
|
||||
|
||||
def find(self, path, *args, **kwargs):
|
||||
detail = kwargs.get("detail", False)
|
||||
ret = self.fs.find(self._join(path), *args, **kwargs)
|
||||
if detail:
|
||||
return {self._relpath(path): info for path, info in ret.items()}
|
||||
return self._relpath(ret)
|
||||
|
||||
async def _expand_path(self, path, *args, **kwargs):
|
||||
return self._relpath(
|
||||
await self.fs._expand_path(self._join(path), *args, **kwargs)
|
||||
)
|
||||
|
||||
def expand_path(self, path, *args, **kwargs):
|
||||
return self._relpath(self.fs.expand_path(self._join(path), *args, **kwargs))
|
||||
|
||||
async def _mkdir(self, path, *args, **kwargs):
|
||||
return await self.fs._mkdir(self._join(path), *args, **kwargs)
|
||||
|
||||
def mkdir(self, path, *args, **kwargs):
|
||||
return self.fs.mkdir(self._join(path), *args, **kwargs)
|
||||
|
||||
async def _makedirs(self, path, *args, **kwargs):
|
||||
return await self.fs._makedirs(self._join(path), *args, **kwargs)
|
||||
|
||||
def makedirs(self, path, *args, **kwargs):
|
||||
return self.fs.makedirs(self._join(path), *args, **kwargs)
|
||||
|
||||
def rmdir(self, path):
|
||||
return self.fs.rmdir(self._join(path))
|
||||
|
||||
def mv(self, path1, path2, **kwargs):
|
||||
return self.fs.mv(
|
||||
self._join(path1),
|
||||
self._join(path2),
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
def touch(self, path, **kwargs):
|
||||
return self.fs.touch(self._join(path), **kwargs)
|
||||
|
||||
def created(self, path):
|
||||
return self.fs.created(self._join(path))
|
||||
|
||||
def modified(self, path):
|
||||
return self.fs.modified(self._join(path))
|
||||
|
||||
def sign(self, path, *args, **kwargs):
|
||||
return self.fs.sign(self._join(path), *args, **kwargs)
|
||||
|
||||
def __repr__(self):
|
||||
return f"{self.__class__.__qualname__}(path='{self.path}', fs={self.fs})"
|
||||
|
||||
def open(
|
||||
self,
|
||||
path,
|
||||
*args,
|
||||
**kwargs,
|
||||
):
|
||||
return self.fs.open(
|
||||
self._join(path),
|
||||
*args,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
async def open_async(
|
||||
self,
|
||||
path,
|
||||
*args,
|
||||
**kwargs,
|
||||
):
|
||||
return await self.fs.open_async(
|
||||
self._join(path),
|
||||
*args,
|
||||
**kwargs,
|
||||
)
|
||||
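A hedged usage sketch of the DirFileSystem defined above, wrapping a local directory; the path below is an assumed example location, not a value from this diff:

import fsspec

# Every path handed to dirfs is resolved relative to /tmp/project on the wrapped
# local filesystem.
dirfs = fsspec.filesystem("dir", path="/tmp/project", target_protocol="file")
dirfs.makedirs("data", exist_ok=True)
dirfs.pipe_file("data/hello.txt", b"hi")      # writes /tmp/project/data/hello.txt
print(dirfs.cat_file("data/hello.txt"))       # b'hi'
print(dirfs.ls("data", detail=False))         # names come back relative to the prefix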
395
.venv/lib/python3.10/site-packages/fsspec/implementations/ftp.py
Normal file
395
.venv/lib/python3.10/site-packages/fsspec/implementations/ftp.py
Normal file
@@ -0,0 +1,395 @@
|
||||
import os
|
||||
import sys
|
||||
import uuid
|
||||
import warnings
|
||||
from ftplib import FTP, FTP_TLS, Error, error_perm
|
||||
from typing import Any
|
||||
|
||||
from ..spec import AbstractBufferedFile, AbstractFileSystem
|
||||
from ..utils import infer_storage_options, isfilelike
|
||||
|
||||
|
||||
class FTPFileSystem(AbstractFileSystem):
|
||||
"""A filesystem over classic FTP"""
|
||||
|
||||
root_marker = "/"
|
||||
cachable = False
|
||||
protocol = "ftp"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
host,
|
||||
port=21,
|
||||
username=None,
|
||||
password=None,
|
||||
acct=None,
|
||||
block_size=None,
|
||||
tempdir=None,
|
||||
timeout=30,
|
||||
encoding="utf-8",
|
||||
tls=False,
|
||||
**kwargs,
|
||||
):
|
||||
"""
|
||||
You can use _get_kwargs_from_urls to get some kwargs from
|
||||
a reasonable FTP url.
|
||||
|
||||
Authentication will be anonymous if username/password are not
|
||||
given.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
host: str
|
||||
The remote server name/ip to connect to
|
||||
port: int
|
||||
Port to connect with
|
||||
username: str or None
|
||||
If authenticating, the user's identifier
|
||||
password: str or None
|
||||
User's password on the server, if using
|
||||
acct: str or None
|
||||
Some servers also need an "account" string for auth
|
||||
block_size: int or None
|
||||
If given, the read-ahead or write buffer size.
|
||||
tempdir: str
|
||||
Directory on remote to put temporary files when in a transaction
|
||||
timeout: int
|
||||
Timeout of the ftp connection in seconds
|
||||
encoding: str
|
||||
Encoding to use for directories and filenames in FTP connection
|
||||
tls: bool
|
||||
Use FTP-TLS, by default False
|
||||
"""
|
||||
super().__init__(**kwargs)
|
||||
self.host = host
|
||||
self.port = port
|
||||
self.tempdir = tempdir or "/tmp"
|
||||
self.cred = username or "", password or "", acct or ""
|
||||
self.timeout = timeout
|
||||
self.encoding = encoding
|
||||
if block_size is not None:
|
||||
self.blocksize = block_size
|
||||
else:
|
||||
self.blocksize = 2**16
|
||||
self.tls = tls
|
||||
self._connect()
|
||||
if self.tls:
|
||||
self.ftp.prot_p()
|
||||
|
||||
def _connect(self):
|
||||
if self.tls:
|
||||
ftp_cls = FTP_TLS
|
||||
else:
|
||||
ftp_cls = FTP
|
||||
if sys.version_info >= (3, 9):
|
||||
self.ftp = ftp_cls(timeout=self.timeout, encoding=self.encoding)
|
||||
elif self.encoding:
|
||||
warnings.warn("`encoding` not supported for python<3.9, ignoring")
|
||||
self.ftp = ftp_cls(timeout=self.timeout)
|
||||
else:
|
||||
self.ftp = ftp_cls(timeout=self.timeout)
|
||||
self.ftp.connect(self.host, self.port)
|
||||
self.ftp.login(*self.cred)
|
||||
|
||||
@classmethod
|
||||
def _strip_protocol(cls, path):
|
||||
return "/" + infer_storage_options(path)["path"].lstrip("/").rstrip("/")
|
||||
|
||||
@staticmethod
|
||||
def _get_kwargs_from_urls(urlpath):
|
||||
out = infer_storage_options(urlpath)
|
||||
out.pop("path", None)
|
||||
out.pop("protocol", None)
|
||||
return out
|
||||
|
||||
def ls(self, path, detail=True, **kwargs):
|
||||
path = self._strip_protocol(path)
|
||||
out = []
|
||||
if path not in self.dircache:
|
||||
try:
|
||||
try:
|
||||
out = [
|
||||
(fn, details)
|
||||
for (fn, details) in self.ftp.mlsd(path)
|
||||
if fn not in [".", ".."]
|
||||
and details["type"] not in ["pdir", "cdir"]
|
||||
]
|
||||
except error_perm:
|
||||
out = _mlsd2(self.ftp, path) # Not platform independent
|
||||
for fn, details in out:
|
||||
details["name"] = "/".join(
|
||||
["" if path == "/" else path, fn.lstrip("/")]
|
||||
)
|
||||
if details["type"] == "file":
|
||||
details["size"] = int(details["size"])
|
||||
else:
|
||||
details["size"] = 0
|
||||
if details["type"] == "dir":
|
||||
details["type"] = "directory"
|
||||
self.dircache[path] = out
|
||||
except Error:
|
||||
try:
|
||||
info = self.info(path)
|
||||
if info["type"] == "file":
|
||||
out = [(path, info)]
|
||||
except (Error, IndexError) as exc:
|
||||
raise FileNotFoundError(path) from exc
|
||||
files = self.dircache.get(path, out)
|
||||
if not detail:
|
||||
return sorted([fn for fn, details in files])
|
||||
return [details for fn, details in files]
|
||||
|
||||
def info(self, path, **kwargs):
|
||||
# implement with direct method
|
||||
path = self._strip_protocol(path)
|
||||
if path == "/":
|
||||
# special case, since this dir has no real entry
|
||||
return {"name": "/", "size": 0, "type": "directory"}
|
||||
files = self.ls(self._parent(path).lstrip("/"), True)
|
||||
try:
|
||||
out = next(f for f in files if f["name"] == path)
|
||||
except StopIteration as exc:
|
||||
raise FileNotFoundError(path) from exc
|
||||
return out
|
||||
|
||||
def get_file(self, rpath, lpath, **kwargs):
|
||||
if self.isdir(rpath):
|
||||
if not os.path.exists(lpath):
|
||||
os.mkdir(lpath)
|
||||
return
|
||||
if isfilelike(lpath):
|
||||
outfile = lpath
|
||||
else:
|
||||
outfile = open(lpath, "wb")
|
||||
|
||||
def cb(x):
|
||||
outfile.write(x)
|
||||
|
||||
self.ftp.retrbinary(
|
||||
f"RETR {rpath}",
|
||||
blocksize=self.blocksize,
|
||||
callback=cb,
|
||||
)
|
||||
if not isfilelike(lpath):
|
||||
outfile.close()
|
||||
|
||||
def cat_file(self, path, start=None, end=None, **kwargs):
|
||||
if end is not None:
|
||||
return super().cat_file(path, start, end, **kwargs)
|
||||
out = []
|
||||
|
||||
def cb(x):
|
||||
out.append(x)
|
||||
|
||||
try:
|
||||
self.ftp.retrbinary(
|
||||
f"RETR {path}",
|
||||
blocksize=self.blocksize,
|
||||
rest=start,
|
||||
callback=cb,
|
||||
)
|
||||
except (Error, error_perm) as orig_exc:
|
||||
raise FileNotFoundError(path) from orig_exc
|
||||
return b"".join(out)
|
||||
|
||||
def _open(
|
||||
self,
|
||||
path,
|
||||
mode="rb",
|
||||
block_size=None,
|
||||
cache_options=None,
|
||||
autocommit=True,
|
||||
**kwargs,
|
||||
):
|
||||
path = self._strip_protocol(path)
|
||||
block_size = block_size or self.blocksize
|
||||
return FTPFile(
|
||||
self,
|
||||
path,
|
||||
mode=mode,
|
||||
block_size=block_size,
|
||||
tempdir=self.tempdir,
|
||||
autocommit=autocommit,
|
||||
cache_options=cache_options,
|
||||
)
|
||||
|
||||
def _rm(self, path):
|
||||
path = self._strip_protocol(path)
|
||||
self.ftp.delete(path)
|
||||
self.invalidate_cache(self._parent(path))
|
||||
|
||||
def rm(self, path, recursive=False, maxdepth=None):
|
||||
paths = self.expand_path(path, recursive=recursive, maxdepth=maxdepth)
|
||||
for p in reversed(paths):
|
||||
if self.isfile(p):
|
||||
self.rm_file(p)
|
||||
else:
|
||||
self.rmdir(p)
|
||||
|
||||
def mkdir(self, path: str, create_parents: bool = True, **kwargs: Any) -> None:
|
||||
path = self._strip_protocol(path)
|
||||
parent = self._parent(path)
|
||||
if parent != self.root_marker and not self.exists(parent) and create_parents:
|
||||
self.mkdir(parent, create_parents=create_parents)
|
||||
|
||||
self.ftp.mkd(path)
|
||||
self.invalidate_cache(self._parent(path))
|
||||
|
||||
def makedirs(self, path: str, exist_ok: bool = False) -> None:
|
||||
path = self._strip_protocol(path)
|
||||
if self.exists(path):
|
||||
# NB: "/" does not "exist" as it has no directory entry
|
||||
if not exist_ok:
|
||||
raise FileExistsError(f"{path} exists without `exist_ok`")
|
||||
# exist_ok=True -> no-op
|
||||
else:
|
||||
self.mkdir(path, create_parents=True)
|
||||
|
||||
def rmdir(self, path):
|
||||
path = self._strip_protocol(path)
|
||||
self.ftp.rmd(path)
|
||||
self.invalidate_cache(self._parent(path))
|
||||
|
||||
def mv(self, path1, path2, **kwargs):
|
||||
path1 = self._strip_protocol(path1)
|
||||
path2 = self._strip_protocol(path2)
|
||||
self.ftp.rename(path1, path2)
|
||||
self.invalidate_cache(self._parent(path1))
|
||||
self.invalidate_cache(self._parent(path2))
|
||||
|
||||
def __del__(self):
|
||||
self.ftp.close()
|
||||
|
||||
def invalidate_cache(self, path=None):
|
||||
if path is None:
|
||||
self.dircache.clear()
|
||||
else:
|
||||
self.dircache.pop(path, None)
|
||||
super().invalidate_cache(path)
|
||||
|
||||
|
||||
class TransferDone(Exception):
|
||||
"""Internal exception to break out of transfer"""
|
||||
|
||||
pass
|
||||
|
||||
|
||||
class FTPFile(AbstractBufferedFile):
|
||||
"""Interact with a remote FTP file with read/write buffering"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
fs,
|
||||
path,
|
||||
mode="rb",
|
||||
block_size="default",
|
||||
autocommit=True,
|
||||
cache_type="readahead",
|
||||
cache_options=None,
|
||||
**kwargs,
|
||||
):
|
||||
super().__init__(
|
||||
fs,
|
||||
path,
|
||||
mode=mode,
|
||||
block_size=block_size,
|
||||
autocommit=autocommit,
|
||||
cache_type=cache_type,
|
||||
cache_options=cache_options,
|
||||
**kwargs,
|
||||
)
|
||||
if not autocommit:
|
||||
self.target = self.path
|
||||
self.path = "/".join([kwargs["tempdir"], str(uuid.uuid4())])
|
||||
|
||||
def commit(self):
|
||||
self.fs.mv(self.path, self.target)
|
||||
|
||||
def discard(self):
|
||||
self.fs.rm(self.path)
|
||||
|
||||
def _fetch_range(self, start, end):
|
||||
"""Get bytes between given byte limits
|
||||
|
||||
Implemented by raising an exception in the fetch callback when the
|
||||
number of bytes received reaches the requested amount.
|
||||
|
||||
Will fail if the server does not respect the REST command on
|
||||
retrieve requests.
|
||||
"""
|
||||
out = []
|
||||
total = [0]
|
||||
|
||||
def callback(x):
|
||||
total[0] += len(x)
|
||||
if total[0] > end - start:
|
||||
out.append(x[: (end - start) - total[0]])
|
||||
if end < self.size:
|
||||
raise TransferDone
|
||||
else:
|
||||
out.append(x)
|
||||
|
||||
if total[0] == end - start and end < self.size:
|
||||
raise TransferDone
|
||||
|
||||
try:
|
||||
self.fs.ftp.retrbinary(
|
||||
f"RETR {self.path}",
|
||||
blocksize=self.blocksize,
|
||||
rest=start,
|
||||
callback=callback,
|
||||
)
|
||||
except TransferDone:
|
||||
try:
|
||||
# stop transfer, we got enough bytes for this block
|
||||
self.fs.ftp.abort()
|
||||
self.fs.ftp.getmultiline()
|
||||
except Error:
|
||||
self.fs._connect()
|
||||
|
||||
return b"".join(out)
|
||||
|
||||
def _upload_chunk(self, final=False):
|
||||
self.buffer.seek(0)
|
||||
self.fs.ftp.storbinary(
|
||||
f"STOR {self.path}", self.buffer, blocksize=self.blocksize, rest=self.offset
|
||||
)
|
||||
return True
|
||||
|
||||
|
||||
def _mlsd2(ftp, path="."):
|
||||
"""
|
||||
Fall back to using `dir` instead of `mlsd` if not supported.
|
||||
|
||||
This parses a Linux style `ls -l` response to `dir`, but the response may
|
||||
be platform dependent.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
ftp: ftplib.FTP
|
||||
path: str
|
||||
Remote path to list; defaults to the current directory ".".
|
||||
"""
|
||||
lines = []
|
||||
minfo = []
|
||||
ftp.dir(path, lines.append)
|
||||
for line in lines:
|
||||
split_line = line.split()
|
||||
if len(split_line) < 9:
|
||||
continue
|
||||
this = (
|
||||
split_line[-1],
|
||||
{
|
||||
"modify": " ".join(split_line[5:8]),
|
||||
"unix.owner": split_line[2],
|
||||
"unix.group": split_line[3],
|
||||
"unix.mode": split_line[0],
|
||||
"size": split_line[4],
|
||||
},
|
||||
)
|
||||
if this[1]["unix.mode"][0] == "d":
|
||||
this[1]["type"] = "dir"
|
||||
else:
|
||||
this[1]["type"] = "file"
|
||||
minfo.append(this)
|
||||
return minfo
|
||||
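A hedged sketch of using the FTP backend above; the host and paths are placeholders, and anonymous login is used when no credentials are given:

import fsspec

ftp = fsspec.filesystem("ftp", host="ftp.example.com", port=21, timeout=30)
print(ftp.ls("/pub", detail=False))            # MLSD listing, or the `dir` fallback above
with ftp.open("/pub/readme.txt", "rb", block_size=2**16) as f:
    head = f.read(1024)                        # buffered read-ahead via FTPFile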
115
.venv/lib/python3.10/site-packages/fsspec/implementations/git.py
Normal file
115
.venv/lib/python3.10/site-packages/fsspec/implementations/git.py
Normal file
@@ -0,0 +1,115 @@
|
||||
import os
|
||||
|
||||
import pygit2
|
||||
|
||||
from fsspec.spec import AbstractFileSystem
|
||||
|
||||
from .memory import MemoryFile
|
||||
|
||||
|
||||
class GitFileSystem(AbstractFileSystem):
|
||||
"""Browse the files of a local git repo at any hash/tag/branch
|
||||
|
||||
(experimental backend)
|
||||
"""
|
||||
|
||||
root_marker = ""
|
||||
cachable = True
|
||||
|
||||
def __init__(self, path=None, fo=None, ref=None, **kwargs):
|
||||
"""
|
||||
|
||||
Parameters
|
||||
----------
|
||||
path: str (optional)
|
||||
Local location of the repo (uses current directory if not given).
|
||||
May be deprecated in favour of ``fo``. When used with a higher
|
||||
level function such as fsspec.open(), may be of the form
|
||||
"git://[path-to-repo[:]][ref@]path/to/file" (but the actual
|
||||
file path should not contain "@" or ":").
|
||||
fo: str (optional)
|
||||
Same as ``path``, but passed as part of a chained URL. This one
|
||||
takes precedence if both are given.
|
||||
ref: str (optional)
|
||||
Reference to work with, could be a hash, tag or branch name. Defaults
|
||||
to current working tree. Note that ``ls`` and ``open`` also take hash,
|
||||
so this becomes the default for those operations
|
||||
kwargs
|
||||
"""
|
||||
super().__init__(**kwargs)
|
||||
self.repo = pygit2.Repository(fo or path or os.getcwd())
|
||||
self.ref = ref or "master"
|
||||
|
||||
@classmethod
|
||||
def _strip_protocol(cls, path):
|
||||
path = super()._strip_protocol(path).lstrip("/")
|
||||
if ":" in path:
|
||||
path = path.split(":", 1)[1]
|
||||
if "@" in path:
|
||||
path = path.split("@", 1)[1]
|
||||
return path.lstrip("/")
|
||||
|
||||
def _path_to_object(self, path, ref):
|
||||
comm, ref = self.repo.resolve_refish(ref or self.ref)
|
||||
parts = path.split("/")
|
||||
tree = comm.tree
|
||||
for part in parts:
|
||||
if part and isinstance(tree, pygit2.Tree):
|
||||
if part not in tree:
|
||||
raise FileNotFoundError(path)
|
||||
tree = tree[part]
|
||||
return tree
|
||||
|
||||
@staticmethod
|
||||
def _get_kwargs_from_urls(path):
|
||||
if path.startswith("git://"):
|
||||
path = path[6:]
|
||||
out = {}
|
||||
if ":" in path:
|
||||
out["path"], path = path.split(":", 1)
|
||||
if "@" in path:
|
||||
out["ref"], path = path.split("@", 1)
|
||||
return out
|
||||
|
||||
@staticmethod
|
||||
def _object_to_info(obj, path=None):
|
||||
# obj.name and obj.filemode are None for the root tree!
|
||||
is_dir = isinstance(obj, pygit2.Tree)
|
||||
return {
|
||||
"type": "directory" if is_dir else "file",
|
||||
"name": (
|
||||
"/".join([path, obj.name or ""]).lstrip("/") if path else obj.name
|
||||
),
|
||||
"hex": str(obj.id),
|
||||
"mode": "100644" if obj.filemode is None else f"{obj.filemode:o}",
|
||||
"size": 0 if is_dir else obj.size,
|
||||
}
|
||||
|
||||
def ls(self, path, detail=True, ref=None, **kwargs):
|
||||
tree = self._path_to_object(self._strip_protocol(path), ref)
|
||||
return [
|
||||
GitFileSystem._object_to_info(obj, path)
|
||||
if detail
|
||||
else GitFileSystem._object_to_info(obj, path)["name"]
|
||||
for obj in (tree if isinstance(tree, pygit2.Tree) else [tree])
|
||||
]
|
||||
|
||||
def info(self, path, ref=None, **kwargs):
|
||||
tree = self._path_to_object(self._strip_protocol(path), ref)
|
||||
return GitFileSystem._object_to_info(tree, path)
|
||||
|
||||
def ukey(self, path, ref=None):
|
||||
return self.info(path, ref=ref)["hex"]
|
||||
|
||||
def _open(
|
||||
self,
|
||||
path,
|
||||
mode="rb",
|
||||
block_size=None,
|
||||
autocommit=True,
|
||||
cache_options=None,
|
||||
ref=None,
|
||||
**kwargs,
|
||||
):
|
||||
obj = self._path_to_object(path, ref or self.ref)
|
||||
return MemoryFile(data=obj.data)
|
||||
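A hedged sketch of the GitFileSystem above; the repository path and ref are illustrative, and pygit2 must be installed:

import fsspec

gitfs = fsspec.filesystem("git", path="/path/to/repo", ref="main")
print(gitfs.ls("", detail=False))              # tree entries at the given ref
with gitfs.open("README.md", "rb") as f:       # content is served from a MemoryFile
    text = f.read()

# Equivalent chained-URL form, "git://[path-to-repo[:]][ref@]path/to/file":
with fsspec.open("git:///path/to/repo:main@README.md", "rb") as f:
    text = f.read()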
267
.venv/lib/python3.10/site-packages/fsspec/implementations/github.py
Normal file
@@ -0,0 +1,267 @@
|
||||
import base64
|
||||
|
||||
import requests
|
||||
|
||||
from ..spec import AbstractFileSystem
|
||||
from ..utils import infer_storage_options
|
||||
from .memory import MemoryFile
|
||||
|
||||
# TODO: add GIST backend, would be very similar
|
||||
|
||||
|
||||
class GithubFileSystem(AbstractFileSystem):
|
||||
"""Interface to files in github
|
||||
|
||||
An instance of this class provides the files residing within a remote github
|
||||
repository. You may specify a point in the repos history, by SHA, branch
|
||||
or tag (default is the repository's default branch).
|
||||
|
||||
For files less than 1 MB in size, file content is returned directly in a
|
||||
MemoryFile. For larger files, or for files tracked by git-lfs, file content
|
||||
is returned as an HTTPFile wrapping the ``download_url`` provided by the
|
||||
GitHub API.
|
||||
|
||||
When using fsspec.open, allows URIs of the form:
|
||||
|
||||
- "github://path/file", in which case you must specify org, repo and
|
||||
may specify sha in the extra args
|
||||
- 'github://org:repo@/precip/catalog.yml', where the org and repo are
|
||||
part of the URI
|
||||
- 'github://org:repo@sha/precip/catalog.yml', where the sha is also included
|
||||
|
||||
``sha`` can be the full or abbreviated hex of the commit you want to fetch
|
||||
from, or a branch or tag name (so long as it doesn't contain special characters
|
||||
like "/", "?", which would have to be HTTP-encoded).
|
||||
|
||||
For authorised access, you must provide username and token, which can be created
|
||||
at https://github.com/settings/tokens
|
||||
"""
|
||||
|
||||
url = "https://api.github.com/repos/{org}/{repo}/git/trees/{sha}"
|
||||
content_url = "https://api.github.com/repos/{org}/{repo}/contents/{path}?ref={sha}"
|
||||
protocol = "github"
|
||||
timeout = (60, 60) # connect, read timeouts
|
||||
|
||||
def __init__(
|
||||
self, org, repo, sha=None, username=None, token=None, timeout=None, **kwargs
|
||||
):
|
||||
super().__init__(**kwargs)
|
||||
self.org = org
|
||||
self.repo = repo
|
||||
if (username is None) ^ (token is None):
|
||||
raise ValueError("Auth required both username and token")
|
||||
self.username = username
|
||||
self.token = token
|
||||
if timeout is not None:
|
||||
self.timeout = timeout
|
||||
if sha is None:
|
||||
# look up default branch (not necessarily "master")
|
||||
u = "https://api.github.com/repos/{org}/{repo}"
|
||||
r = requests.get(
|
||||
u.format(org=org, repo=repo), timeout=self.timeout, **self.kw
|
||||
)
|
||||
r.raise_for_status()
|
||||
sha = r.json()["default_branch"]
|
||||
|
||||
self.root = sha
|
||||
self.ls("")
|
||||
try:
|
||||
from .http import HTTPFileSystem
|
||||
|
||||
self.http_fs = HTTPFileSystem(**kwargs)
|
||||
except ImportError:
|
||||
self.http_fs = None
|
||||
|
||||
@property
|
||||
def kw(self):
|
||||
if self.username:
|
||||
return {"auth": (self.username, self.token)}
|
||||
return {}
|
||||
|
||||
@classmethod
|
||||
def repos(cls, org_or_user, is_org=True):
|
||||
"""List repo names for given org or user
|
||||
|
||||
This may become the top level of the FS
|
||||
|
||||
Parameters
|
||||
----------
|
||||
org_or_user: str
|
||||
Name of the github org or user to query
|
||||
is_org: bool (default True)
|
||||
Whether the name is an organisation (True) or user (False)
|
||||
|
||||
Returns
|
||||
-------
|
||||
List of string
|
||||
"""
|
||||
r = requests.get(
|
||||
f"https://api.github.com/{['users', 'orgs'][is_org]}/{org_or_user}/repos",
|
||||
timeout=cls.timeout,
|
||||
)
|
||||
r.raise_for_status()
|
||||
return [repo["name"] for repo in r.json()]
|
||||
|
||||
@property
|
||||
def tags(self):
|
||||
"""Names of tags in the repo"""
|
||||
r = requests.get(
|
||||
f"https://api.github.com/repos/{self.org}/{self.repo}/tags",
|
||||
timeout=self.timeout,
|
||||
**self.kw,
|
||||
)
|
||||
r.raise_for_status()
|
||||
return [t["name"] for t in r.json()]
|
||||
|
||||
@property
|
||||
def branches(self):
|
||||
"""Names of branches in the repo"""
|
||||
r = requests.get(
|
||||
f"https://api.github.com/repos/{self.org}/{self.repo}/branches",
|
||||
timeout=self.timeout,
|
||||
**self.kw,
|
||||
)
|
||||
r.raise_for_status()
|
||||
return [t["name"] for t in r.json()]
|
||||
|
||||
@property
|
||||
def refs(self):
|
||||
"""Named references, tags and branches"""
|
||||
return {"tags": self.tags, "branches": self.branches}
|
||||
|
||||
def ls(self, path, detail=False, sha=None, _sha=None, **kwargs):
|
||||
"""List files at given path
|
||||
|
||||
Parameters
|
||||
----------
|
||||
path: str
|
||||
Location to list, relative to repo root
|
||||
detail: bool
|
||||
If True, returns list of dicts, one per file; if False, returns
|
||||
list of full filenames only
|
||||
sha: str (optional)
|
||||
List at the given point in the repo history, branch or tag name or commit
|
||||
SHA
|
||||
_sha: str (optional)
|
||||
List this specific tree object (used internally to descend into trees)
|
||||
"""
|
||||
path = self._strip_protocol(path)
|
||||
if path == "":
|
||||
_sha = sha or self.root
|
||||
if _sha is None:
|
||||
parts = path.rstrip("/").split("/")
|
||||
so_far = ""
|
||||
_sha = sha or self.root
|
||||
for part in parts:
|
||||
out = self.ls(so_far, True, sha=sha, _sha=_sha)
|
||||
so_far += "/" + part if so_far else part
|
||||
out = [o for o in out if o["name"] == so_far]
|
||||
if not out:
|
||||
raise FileNotFoundError(path)
|
||||
out = out[0]
|
||||
if out["type"] == "file":
|
||||
if detail:
|
||||
return [out]
|
||||
else:
|
||||
return path
|
||||
_sha = out["sha"]
|
||||
if path not in self.dircache or sha not in [self.root, None]:
|
||||
r = requests.get(
|
||||
self.url.format(org=self.org, repo=self.repo, sha=_sha),
|
||||
timeout=self.timeout,
|
||||
**self.kw,
|
||||
)
|
||||
if r.status_code == 404:
|
||||
raise FileNotFoundError(path)
|
||||
r.raise_for_status()
|
||||
types = {"blob": "file", "tree": "directory"}
|
||||
out = [
|
||||
{
|
||||
"name": path + "/" + f["path"] if path else f["path"],
|
||||
"mode": f["mode"],
|
||||
"type": types[f["type"]],
|
||||
"size": f.get("size", 0),
|
||||
"sha": f["sha"],
|
||||
}
|
||||
for f in r.json()["tree"]
|
||||
if f["type"] in types
|
||||
]
|
||||
if sha in [self.root, None]:
|
||||
self.dircache[path] = out
|
||||
else:
|
||||
out = self.dircache[path]
|
||||
if detail:
|
||||
return out
|
||||
else:
|
||||
return sorted([f["name"] for f in out])
|
||||
|
||||
def invalidate_cache(self, path=None):
|
||||
self.dircache.clear()
|
||||
|
||||
@classmethod
|
||||
def _strip_protocol(cls, path):
|
||||
opts = infer_storage_options(path)
|
||||
if "username" not in opts:
|
||||
return super()._strip_protocol(path)
|
||||
return opts["path"].lstrip("/")
|
||||
|
||||
@staticmethod
|
||||
def _get_kwargs_from_urls(path):
|
||||
opts = infer_storage_options(path)
|
||||
if "username" not in opts:
|
||||
return {}
|
||||
out = {"org": opts["username"], "repo": opts["password"]}
|
||||
if opts["host"]:
|
||||
out["sha"] = opts["host"]
|
||||
return out
|
||||
|
||||
def _open(
|
||||
self,
|
||||
path,
|
||||
mode="rb",
|
||||
block_size=None,
|
||||
cache_options=None,
|
||||
sha=None,
|
||||
**kwargs,
|
||||
):
|
||||
if mode != "rb":
|
||||
raise NotImplementedError
|
||||
|
||||
# construct a url to hit the GitHub API's repo contents API
|
||||
url = self.content_url.format(
|
||||
org=self.org, repo=self.repo, path=path, sha=sha or self.root
|
||||
)
|
||||
|
||||
# make a request to this API, and parse the response as JSON
|
||||
r = requests.get(url, timeout=self.timeout, **self.kw)
|
||||
if r.status_code == 404:
|
||||
raise FileNotFoundError(path)
|
||||
r.raise_for_status()
|
||||
content_json = r.json()
|
||||
|
||||
# if the response's content key is not empty, try to parse it as base64
|
||||
if content_json["content"]:
|
||||
content = base64.b64decode(content_json["content"])
|
||||
|
||||
# as long as the content does not start with the string
|
||||
# "version https://git-lfs.github.com/"
|
||||
# then it is probably not a git-lfs pointer and we can just return
|
||||
# the content directly
|
||||
if not content.startswith(b"version https://git-lfs.github.com/"):
|
||||
return MemoryFile(None, None, content)
|
||||
|
||||
# we land here if the content was not present in the first response
|
||||
# (regular file over 1MB or git-lfs tracked file)
|
||||
# in this case, we let the HTTPFileSystem handle the download
|
||||
if self.http_fs is None:
|
||||
raise ImportError(
|
||||
"Please install fsspec[http] to access github files >1 MB "
|
||||
"or git-lfs tracked files."
|
||||
)
|
||||
return self.http_fs.open(
|
||||
content_json["download_url"],
|
||||
mode=mode,
|
||||
block_size=block_size,
|
||||
cache_options=cache_options,
|
||||
**kwargs,
|
||||
)
|
||||
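A hedged sketch of the GithubFileSystem above; the org and repo names are placeholders, and unauthenticated requests are rate-limited by the GitHub API:

import fsspec

gh = fsspec.filesystem("github", org="some-org", repo="some-repo")
print(gh.ls("", detail=False))                 # tree of the default branch
with gh.open("README.md", "rb") as f:          # files under 1 MB arrive as a MemoryFile
    readme = f.read()

# The org:repo@sha form packs everything into the URL:
with fsspec.open("github://some-org:some-repo@main/README.md", "rb") as f:
    readme = f.read()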
880
.venv/lib/python3.10/site-packages/fsspec/implementations/http.py
Normal file
@@ -0,0 +1,880 @@
|
||||
import asyncio
|
||||
import io
|
||||
import logging
|
||||
import re
|
||||
import weakref
|
||||
from copy import copy
|
||||
from urllib.parse import urlparse
|
||||
|
||||
import aiohttp
|
||||
import yarl
|
||||
|
||||
from fsspec.asyn import AbstractAsyncStreamedFile, AsyncFileSystem, sync, sync_wrapper
|
||||
from fsspec.callbacks import DEFAULT_CALLBACK
|
||||
from fsspec.exceptions import FSTimeoutError
|
||||
from fsspec.spec import AbstractBufferedFile
|
||||
from fsspec.utils import (
|
||||
DEFAULT_BLOCK_SIZE,
|
||||
glob_translate,
|
||||
isfilelike,
|
||||
nullcontext,
|
||||
tokenize,
|
||||
)
|
||||
|
||||
from ..caching import AllBytes
|
||||
|
||||
# https://stackoverflow.com/a/15926317/3821154
|
||||
ex = re.compile(r"""<(a|A)\s+(?:[^>]*?\s+)?(href|HREF)=["'](?P<url>[^"']+)""")
|
||||
ex2 = re.compile(r"""(?P<url>http[s]?://[-a-zA-Z0-9@:%_+.~#?&/=]+)""")
|
||||
logger = logging.getLogger("fsspec.http")
|
||||
|
||||
|
||||
async def get_client(**kwargs):
|
||||
return aiohttp.ClientSession(**kwargs)
|
||||
|
||||
|
||||
class HTTPFileSystem(AsyncFileSystem):
|
||||
"""
|
||||
Simple File-System for fetching data via HTTP(S)
|
||||
|
||||
``ls()`` is implemented by loading the parent page and doing a regex
|
||||
match on the result. If simple_link=True, anything of the form
|
||||
"http(s)://server.com/stuff?thing=other"; otherwise only links within
|
||||
HTML href tags will be used.
|
||||
"""
|
||||
|
||||
sep = "/"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
simple_links=True,
|
||||
block_size=None,
|
||||
same_scheme=True,
|
||||
size_policy=None,
|
||||
cache_type="bytes",
|
||||
cache_options=None,
|
||||
asynchronous=False,
|
||||
loop=None,
|
||||
client_kwargs=None,
|
||||
get_client=get_client,
|
||||
encoded=False,
|
||||
**storage_options,
|
||||
):
|
||||
"""
|
||||
NB: if this is called async, you must await set_session
|
||||
|
||||
Parameters
|
||||
----------
|
||||
block_size: int
|
||||
Blocks to read bytes; if 0, will default to streaming file-like
|
||||
objects instead of HTTPFile instances
|
||||
simple_links: bool
|
||||
If True, will consider both HTML <a> tags and anything that looks
|
||||
like a URL; if False, will consider only the former.
|
||||
same_scheme: True
|
||||
When doing ls/glob, if this is True, only consider paths that have
|
||||
http/https matching the input URLs.
|
||||
size_policy: this argument is deprecated
|
||||
client_kwargs: dict
|
||||
Passed to aiohttp.ClientSession, see
|
||||
https://docs.aiohttp.org/en/stable/client_reference.html
|
||||
For example, ``{'auth': aiohttp.BasicAuth('user', 'pass')}``
|
||||
get_client: Callable[..., aiohttp.ClientSession]
|
||||
A callable which takes keyword arguments and constructs
|
||||
an aiohttp.ClientSession. Its state will be managed by
|
||||
the HTTPFileSystem class.
|
||||
storage_options: key-value
|
||||
Any other parameters passed on to requests
|
||||
cache_type, cache_options: defaults used in open
|
||||
"""
|
||||
super().__init__(self, asynchronous=asynchronous, loop=loop, **storage_options)
|
||||
self.block_size = block_size if block_size is not None else DEFAULT_BLOCK_SIZE
|
||||
self.simple_links = simple_links
|
||||
self.same_schema = same_scheme
|
||||
self.cache_type = cache_type
|
||||
self.cache_options = cache_options
|
||||
self.client_kwargs = client_kwargs or {}
|
||||
self.get_client = get_client
|
||||
self.encoded = encoded
|
||||
self.kwargs = storage_options
|
||||
self._session = None
|
||||
|
||||
# Clean caching-related parameters from `storage_options`
|
||||
# before propagating them as `request_options` through `self.kwargs`.
|
||||
# TODO: Maybe rename `self.kwargs` to `self.request_options` to make
|
||||
# it clearer.
|
||||
request_options = copy(storage_options)
|
||||
self.use_listings_cache = request_options.pop("use_listings_cache", False)
|
||||
request_options.pop("listings_expiry_time", None)
|
||||
request_options.pop("max_paths", None)
|
||||
request_options.pop("skip_instance_cache", None)
|
||||
self.kwargs = request_options
|
||||
|
||||
@property
|
||||
def fsid(self):
|
||||
return "http"
|
||||
|
||||
def encode_url(self, url):
|
||||
return yarl.URL(url, encoded=self.encoded)
|
||||
|
||||
@staticmethod
|
||||
def close_session(loop, session):
|
||||
if loop is not None and loop.is_running():
|
||||
try:
|
||||
sync(loop, session.close, timeout=0.1)
|
||||
return
|
||||
except (TimeoutError, FSTimeoutError, NotImplementedError):
|
||||
pass
|
||||
connector = getattr(session, "_connector", None)
|
||||
if connector is not None:
|
||||
# close after loop is dead
|
||||
connector._close()
|
||||
|
||||
async def set_session(self):
|
||||
if self._session is None:
|
||||
self._session = await self.get_client(loop=self.loop, **self.client_kwargs)
|
||||
if not self.asynchronous:
|
||||
weakref.finalize(self, self.close_session, self.loop, self._session)
|
||||
return self._session
|
||||
|
||||
@classmethod
|
||||
def _strip_protocol(cls, path):
|
||||
"""For HTTP, we always want to keep the full URL"""
|
||||
return path
|
||||
|
||||
@classmethod
|
||||
def _parent(cls, path):
|
||||
# override, since _strip_protocol is different for URLs
|
||||
par = super()._parent(path)
|
||||
if len(par) > 7: # "http://..."
|
||||
return par
|
||||
return ""
|
||||
|
||||
async def _ls_real(self, url, detail=True, **kwargs):
|
||||
# ignoring URL-encoded arguments
|
||||
kw = self.kwargs.copy()
|
||||
kw.update(kwargs)
|
||||
logger.debug(url)
|
||||
session = await self.set_session()
|
||||
async with session.get(self.encode_url(url), **self.kwargs) as r:
|
||||
self._raise_not_found_for_status(r, url)
|
||||
try:
|
||||
text = await r.text()
|
||||
if self.simple_links:
|
||||
links = ex2.findall(text) + [u[2] for u in ex.findall(text)]
|
||||
else:
|
||||
links = [u[2] for u in ex.findall(text)]
|
||||
except UnicodeDecodeError:
|
||||
links = [] # binary, not HTML
|
||||
out = set()
|
||||
parts = urlparse(url)
|
||||
for l in links:
|
||||
if isinstance(l, tuple):
|
||||
l = l[1]
|
||||
if l.startswith("/") and len(l) > 1:
|
||||
# absolute URL on this server
|
||||
l = f"{parts.scheme}://{parts.netloc}{l}"
|
||||
if l.startswith("http"):
|
||||
if self.same_schema and l.startswith(url.rstrip("/") + "/"):
|
||||
out.add(l)
|
||||
elif l.replace("https", "http").startswith(
|
||||
url.replace("https", "http").rstrip("/") + "/"
|
||||
):
|
||||
# allowed to cross http <-> https
|
||||
out.add(l)
|
||||
else:
|
||||
if l not in ["..", "../"]:
|
||||
# Ignore FTP-like "parent"
|
||||
out.add("/".join([url.rstrip("/"), l.lstrip("/")]))
|
||||
if not out and url.endswith("/"):
|
||||
out = await self._ls_real(url.rstrip("/"), detail=False)
|
||||
if detail:
|
||||
return [
|
||||
{
|
||||
"name": u,
|
||||
"size": None,
|
||||
"type": "directory" if u.endswith("/") else "file",
|
||||
}
|
||||
for u in out
|
||||
]
|
||||
else:
|
||||
return sorted(out)
|
||||
|
||||
async def _ls(self, url, detail=True, **kwargs):
|
||||
if self.use_listings_cache and url in self.dircache:
|
||||
out = self.dircache[url]
|
||||
else:
|
||||
out = await self._ls_real(url, detail=detail, **kwargs)
|
||||
self.dircache[url] = out
|
||||
return out
|
||||
|
||||
ls = sync_wrapper(_ls)
|
||||
|
||||
def _raise_not_found_for_status(self, response, url):
|
||||
"""
|
||||
Raises FileNotFoundError for 404s, otherwise uses raise_for_status.
|
||||
"""
|
||||
if response.status == 404:
|
||||
raise FileNotFoundError(url)
|
||||
response.raise_for_status()
|
||||
|
||||
async def _cat_file(self, url, start=None, end=None, **kwargs):
|
||||
kw = self.kwargs.copy()
|
||||
kw.update(kwargs)
|
||||
logger.debug(url)
|
||||
|
||||
if start is not None or end is not None:
|
||||
if start == end:
|
||||
return b""
|
||||
headers = kw.pop("headers", {}).copy()
|
||||
|
||||
headers["Range"] = await self._process_limits(url, start, end)
|
||||
kw["headers"] = headers
|
||||
session = await self.set_session()
|
||||
async with session.get(self.encode_url(url), **kw) as r:
|
||||
out = await r.read()
|
||||
self._raise_not_found_for_status(r, url)
|
||||
return out
|
||||
|
||||
async def _get_file(
|
||||
self, rpath, lpath, chunk_size=5 * 2**20, callback=DEFAULT_CALLBACK, **kwargs
|
||||
):
|
||||
kw = self.kwargs.copy()
|
||||
kw.update(kwargs)
|
||||
logger.debug(rpath)
|
||||
session = await self.set_session()
|
||||
async with session.get(self.encode_url(rpath), **kw) as r:
|
||||
try:
|
||||
size = int(r.headers["content-length"])
|
||||
except (ValueError, KeyError):
|
||||
size = None
|
||||
|
||||
callback.set_size(size)
|
||||
self._raise_not_found_for_status(r, rpath)
|
||||
if isfilelike(lpath):
|
||||
outfile = lpath
|
||||
else:
|
||||
outfile = open(lpath, "wb") # noqa: ASYNC101, ASYNC230
|
||||
|
||||
try:
|
||||
chunk = True
|
||||
while chunk:
|
||||
chunk = await r.content.read(chunk_size)
|
||||
outfile.write(chunk)
|
||||
callback.relative_update(len(chunk))
|
||||
finally:
|
||||
if not isfilelike(lpath):
|
||||
outfile.close()
|
||||
|
||||
async def _put_file(
|
||||
self,
|
||||
lpath,
|
||||
rpath,
|
||||
chunk_size=5 * 2**20,
|
||||
callback=DEFAULT_CALLBACK,
|
||||
method="post",
|
||||
mode="overwrite",
|
||||
**kwargs,
|
||||
):
|
||||
if mode != "overwrite":
|
||||
raise NotImplementedError("Exclusive write")
|
||||
|
||||
async def gen_chunks():
|
||||
# Support passing arbitrary file-like objects
|
||||
# and use them instead of streams.
|
||||
if isinstance(lpath, io.IOBase):
|
||||
context = nullcontext(lpath)
|
||||
use_seek = False # might not support seeking
|
||||
else:
|
||||
context = open(lpath, "rb") # noqa: ASYNC101, ASYNC230
|
||||
use_seek = True
|
||||
|
||||
with context as f:
|
||||
if use_seek:
|
||||
callback.set_size(f.seek(0, 2))
|
||||
f.seek(0)
|
||||
else:
|
||||
callback.set_size(getattr(f, "size", None))
|
||||
|
||||
chunk = f.read(chunk_size)
|
||||
while chunk:
|
||||
yield chunk
|
||||
callback.relative_update(len(chunk))
|
||||
chunk = f.read(chunk_size)
|
||||
|
||||
kw = self.kwargs.copy()
|
||||
kw.update(kwargs)
|
||||
session = await self.set_session()
|
||||
|
||||
method = method.lower()
|
||||
if method not in ("post", "put"):
|
||||
raise ValueError(
|
||||
f"method has to be either 'post' or 'put', not: {method!r}"
|
||||
)
|
||||
|
||||
meth = getattr(session, method)
|
||||
async with meth(self.encode_url(rpath), data=gen_chunks(), **kw) as resp:
|
||||
self._raise_not_found_for_status(resp, rpath)
|
||||
|
||||
async def _exists(self, path, **kwargs):
|
||||
kw = self.kwargs.copy()
|
||||
kw.update(kwargs)
|
||||
try:
|
||||
logger.debug(path)
|
||||
session = await self.set_session()
|
||||
r = await session.get(self.encode_url(path), **kw)
|
||||
async with r:
|
||||
return r.status < 400
|
||||
except aiohttp.ClientError:
|
||||
return False
|
||||
|
||||
async def _isfile(self, path, **kwargs):
|
||||
return await self._exists(path, **kwargs)
|
||||
|
||||
def _open(
|
||||
self,
|
||||
path,
|
||||
mode="rb",
|
||||
block_size=None,
|
||||
autocommit=None, # XXX: This differs from the base class.
|
||||
cache_type=None,
|
||||
cache_options=None,
|
||||
size=None,
|
||||
**kwargs,
|
||||
):
|
||||
"""Make a file-like object
|
||||
|
||||
Parameters
|
||||
----------
|
||||
path: str
|
||||
Full URL with protocol
|
||||
mode: string
|
||||
must be "rb"
|
||||
block_size: int or None
|
||||
Bytes to download in one request; use instance value if None. If
|
||||
zero, will return a streaming file-like instance instead.
|
||||
kwargs: key-value
|
||||
Any other parameters, passed to requests calls
|
||||
"""
|
||||
if mode != "rb":
|
||||
raise NotImplementedError
|
||||
block_size = block_size if block_size is not None else self.block_size
|
||||
kw = self.kwargs.copy()
|
||||
kw["asynchronous"] = self.asynchronous
|
||||
kw.update(kwargs)
|
||||
info = {}
|
||||
size = size or info.update(self.info(path, **kwargs)) or info["size"]
|
||||
session = sync(self.loop, self.set_session)
|
||||
if block_size and size and info.get("partial", True):
|
||||
return HTTPFile(
|
||||
self,
|
||||
path,
|
||||
session=session,
|
||||
block_size=block_size,
|
||||
mode=mode,
|
||||
size=size,
|
||||
cache_type=cache_type or self.cache_type,
|
||||
cache_options=cache_options or self.cache_options,
|
||||
loop=self.loop,
|
||||
**kw,
|
||||
)
|
||||
else:
|
||||
return HTTPStreamFile(
|
||||
self,
|
||||
path,
|
||||
mode=mode,
|
||||
loop=self.loop,
|
||||
session=session,
|
||||
**kw,
|
||||
)
|
||||
|
||||
async def open_async(self, path, mode="rb", size=None, **kwargs):
|
||||
session = await self.set_session()
|
||||
if size is None:
|
||||
try:
|
||||
size = (await self._info(path, **kwargs))["size"]
|
||||
except FileNotFoundError:
|
||||
pass
|
||||
return AsyncStreamFile(
|
||||
self,
|
||||
path,
|
||||
loop=self.loop,
|
||||
session=session,
|
||||
size=size,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
def ukey(self, url):
|
||||
"""Unique identifier; assume HTTP files are static, unchanging"""
|
||||
return tokenize(url, self.kwargs, self.protocol)
|
||||
|
||||
async def _info(self, url, **kwargs):
|
||||
"""Get info of URL
|
||||
|
||||
Tries to access location via HEAD, and then GET methods, but does
|
||||
not fetch the data.
|
||||
|
||||
It is possible that the server does not supply any size information, in
|
||||
which case size will be given as None (and certain operations on the
|
||||
corresponding file will not work).
|
||||
"""
|
||||
info = {}
|
||||
session = await self.set_session()
|
||||
|
||||
for policy in ["head", "get"]:
|
||||
try:
|
||||
info.update(
|
||||
await _file_info(
|
||||
self.encode_url(url),
|
||||
size_policy=policy,
|
||||
session=session,
|
||||
**self.kwargs,
|
||||
**kwargs,
|
||||
)
|
||||
)
|
||||
if info.get("size") is not None:
|
||||
break
|
||||
except Exception as exc:
|
||||
if policy == "get":
|
||||
# If get failed, then raise a FileNotFoundError
|
||||
raise FileNotFoundError(url) from exc
|
||||
logger.debug("", exc_info=exc)
|
||||
|
||||
return {"name": url, "size": None, **info, "type": "file"}
|
||||
|
||||
async def _glob(self, path, maxdepth=None, **kwargs):
|
||||
"""
|
||||
Find files by glob-matching.
|
||||
|
||||
This implementation is identical to the one in AbstractFileSystem,
|
||||
but "?" is not considered as a character for globbing, because it is
|
||||
so common in URLs, often identifying the "query" part.
|
||||
"""
|
||||
if maxdepth is not None and maxdepth < 1:
|
||||
raise ValueError("maxdepth must be at least 1")
|
||||
import re
|
||||
|
||||
ends_with_slash = path.endswith("/") # _strip_protocol strips trailing slash
|
||||
path = self._strip_protocol(path)
|
||||
append_slash_to_dirname = ends_with_slash or path.endswith(("/**", "/*"))
|
||||
idx_star = path.find("*") if path.find("*") >= 0 else len(path)
|
||||
idx_brace = path.find("[") if path.find("[") >= 0 else len(path)
|
||||
|
||||
min_idx = min(idx_star, idx_brace)
|
||||
|
||||
detail = kwargs.pop("detail", False)
|
||||
|
||||
if not has_magic(path):
|
||||
if await self._exists(path, **kwargs):
|
||||
if not detail:
|
||||
return [path]
|
||||
else:
|
||||
return {path: await self._info(path, **kwargs)}
|
||||
else:
|
||||
if not detail:
|
||||
return [] # glob of non-existent returns empty
|
||||
else:
|
||||
return {}
|
||||
elif "/" in path[:min_idx]:
|
||||
min_idx = path[:min_idx].rindex("/")
|
||||
root = path[: min_idx + 1]
|
||||
depth = path[min_idx + 1 :].count("/") + 1
|
||||
else:
|
||||
root = ""
|
||||
depth = path[min_idx + 1 :].count("/") + 1
|
||||
|
||||
if "**" in path:
|
||||
if maxdepth is not None:
|
||||
idx_double_stars = path.find("**")
|
||||
depth_double_stars = path[idx_double_stars:].count("/") + 1
|
||||
depth = depth - depth_double_stars + maxdepth
|
||||
else:
|
||||
depth = None
|
||||
|
||||
allpaths = await self._find(
|
||||
root, maxdepth=depth, withdirs=True, detail=True, **kwargs
|
||||
)
|
||||
|
||||
pattern = glob_translate(path + ("/" if ends_with_slash else ""))
|
||||
pattern = re.compile(pattern)
|
||||
|
||||
out = {
|
||||
(
|
||||
p.rstrip("/")
|
||||
if not append_slash_to_dirname
|
||||
and info["type"] == "directory"
|
||||
and p.endswith("/")
|
||||
else p
|
||||
): info
|
||||
for p, info in sorted(allpaths.items())
|
||||
if pattern.match(p.rstrip("/"))
|
||||
}
|
||||
|
||||
if detail:
|
||||
return out
|
||||
else:
|
||||
return list(out)
|
||||
|
||||
async def _isdir(self, path):
|
||||
# override, since all URLs are (also) files
|
||||
try:
|
||||
return bool(await self._ls(path))
|
||||
except (FileNotFoundError, ValueError):
|
||||
return False
|
||||
|
||||
async def _pipe_file(self, path, value, mode="overwrite", **kwargs):
|
||||
"""
|
||||
Write bytes to a remote file over HTTP.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
path : str
|
||||
Target URL where the data should be written
|
||||
value : bytes
|
||||
Data to be written
|
||||
mode : str
|
||||
How to write to the file - 'overwrite' or 'append'
|
||||
**kwargs : dict
|
||||
Additional parameters to pass to the HTTP request
|
||||
"""
|
||||
url = self._strip_protocol(path)
|
||||
headers = kwargs.pop("headers", {})
|
||||
headers["Content-Length"] = str(len(value))
|
||||
|
||||
session = await self.set_session()
|
||||
|
||||
async with session.put(url, data=value, headers=headers, **kwargs) as r:
|
||||
r.raise_for_status()
|
||||
|
||||
|
||||
class HTTPFile(AbstractBufferedFile):
|
||||
"""
|
||||
A file-like object pointing to a remote HTTP(S) resource
|
||||
|
||||
Supports only reading, with read-ahead of a predetermined block-size.
|
||||
|
||||
In the case that the server does not supply the filesize, only reading of
|
||||
the complete file in one go is supported.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
url: str
|
||||
Full URL of the remote resource, including the protocol
|
||||
session: aiohttp.ClientSession or None
|
||||
All calls will be made within this session, to avoid restarting
|
||||
connections where the server allows this
|
||||
block_size: int or None
|
||||
The amount of read-ahead to do, in bytes. Default is 5MB, or the value
|
||||
configured for the FileSystem creating this file
|
||||
size: None or int
|
||||
If given, this is the size of the file in bytes, and we don't attempt
|
||||
to call the server to find the value.
|
||||
kwargs: all other key-values are passed to requests calls.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
fs,
|
||||
url,
|
||||
session=None,
|
||||
block_size=None,
|
||||
mode="rb",
|
||||
cache_type="bytes",
|
||||
cache_options=None,
|
||||
size=None,
|
||||
loop=None,
|
||||
asynchronous=False,
|
||||
**kwargs,
|
||||
):
|
||||
if mode != "rb":
|
||||
raise NotImplementedError("File mode not supported")
|
||||
self.asynchronous = asynchronous
|
||||
self.loop = loop
|
||||
self.url = url
|
||||
self.session = session
|
||||
self.details = {"name": url, "size": size, "type": "file"}
|
||||
super().__init__(
|
||||
fs=fs,
|
||||
path=url,
|
||||
mode=mode,
|
||||
block_size=block_size,
|
||||
cache_type=cache_type,
|
||||
cache_options=cache_options,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
def read(self, length=-1):
|
||||
"""Read bytes from file
|
||||
|
||||
Parameters
|
||||
----------
|
||||
length: int
|
||||
Read up to this many bytes. If negative, read all content to end of
|
||||
file. If the server has not supplied the filesize, attempting to
|
||||
read only part of the data will raise a ValueError.
|
||||
"""
|
||||
if (
|
||||
(length < 0 and self.loc == 0) # explicit read all
|
||||
# but not when the size is known and fits into a block anyways
|
||||
and not (self.size is not None and self.size <= self.blocksize)
|
||||
):
|
||||
self._fetch_all()
|
||||
if self.size is None:
|
||||
if length < 0:
|
||||
self._fetch_all()
|
||||
else:
|
||||
length = min(self.size - self.loc, length)
|
||||
return super().read(length)
|
||||
|
||||
async def async_fetch_all(self):
|
||||
"""Read whole file in one shot, without caching
|
||||
|
||||
This is only called when position is still at zero,
|
||||
and read() is called without a byte-count.
|
||||
"""
|
||||
logger.debug(f"Fetch all for {self}")
|
||||
if not isinstance(self.cache, AllBytes):
|
||||
r = await self.session.get(self.fs.encode_url(self.url), **self.kwargs)
|
||||
async with r:
|
||||
r.raise_for_status()
|
||||
out = await r.read()
|
||||
self.cache = AllBytes(
|
||||
size=len(out), fetcher=None, blocksize=None, data=out
|
||||
)
|
||||
self.size = len(out)
|
||||
|
||||
_fetch_all = sync_wrapper(async_fetch_all)
|
||||
|
||||
def _parse_content_range(self, headers):
|
||||
"""Parse the Content-Range header"""
|
||||
s = headers.get("Content-Range", "")
|
||||
m = re.match(r"bytes (\d+-\d+|\*)/(\d+|\*)", s)
|
||||
if not m:
|
||||
return None, None, None
|
||||
|
||||
if m[1] == "*":
|
||||
start = end = None
|
||||
else:
|
||||
start, end = [int(x) for x in m[1].split("-")]
|
||||
total = None if m[2] == "*" else int(m[2])
|
||||
return start, end, total
|
||||
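# Hedged illustration of _parse_content_range (header values are examples, not from the diff):
#   "bytes 0-1023/4096"   -> (0, 1023, 4096)
#   "bytes */4096"        -> (None, None, 4096)
#   missing or malformed  -> (None, None, None)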
|
||||
async def async_fetch_range(self, start, end):
|
||||
"""Download a block of data
|
||||
|
||||
The expectation is that the server returns only the requested bytes,
|
||||
with HTTP code 206. If this is not the case, we first check the headers,
|
||||
and then stream the output - if the data size is bigger than we
|
||||
requested, an exception is raised.
|
||||
"""
|
||||
logger.debug(f"Fetch range for {self}: {start}-{end}")
|
||||
kwargs = self.kwargs.copy()
|
||||
headers = kwargs.pop("headers", {}).copy()
|
||||
headers["Range"] = f"bytes={start}-{end - 1}"
|
||||
logger.debug(f"{self.url} : {headers['Range']}")
|
||||
r = await self.session.get(
|
||||
self.fs.encode_url(self.url), headers=headers, **kwargs
|
||||
)
|
||||
async with r:
|
||||
if r.status == 416:
|
||||
# range request outside file
|
||||
return b""
|
||||
r.raise_for_status()
|
||||
|
||||
# If the server has handled the range request, it should reply
|
||||
# with status 206 (partial content). But we'll guess that a suitable
|
||||
# Content-Range header or a Content-Length no more than the
|
||||
# requested range also means we have got the desired range.
|
||||
response_is_range = (
|
||||
r.status == 206
|
||||
or self._parse_content_range(r.headers)[0] == start
|
||||
or int(r.headers.get("Content-Length", end + 1)) <= end - start
|
||||
)
|
||||
|
||||
if response_is_range:
|
||||
# partial content, as expected
|
||||
out = await r.read()
|
||||
elif start > 0:
|
||||
raise ValueError(
|
||||
"The HTTP server doesn't appear to support range requests. "
|
||||
"Only reading this file from the beginning is supported. "
|
||||
"Open with block_size=0 for a streaming file interface."
|
||||
)
|
||||
else:
|
||||
# Response is not a range, but we want the start of the file,
|
||||
# so we can read the required amount anyway.
|
||||
cl = 0
|
||||
out = []
|
||||
while True:
|
||||
chunk = await r.content.read(2**20)
|
||||
# data size unknown, let's read until we have enough
|
||||
if chunk:
|
||||
out.append(chunk)
|
||||
cl += len(chunk)
|
||||
if cl > end - start:
|
||||
break
|
||||
else:
|
||||
break
|
||||
out = b"".join(out)[: end - start]
|
||||
return out
|
||||
|
||||
_fetch_range = sync_wrapper(async_fetch_range)
|
||||
|
||||
|
||||
magic_check = re.compile("([*[])")
|
||||
|
||||
|
||||
def has_magic(s):
|
||||
match = magic_check.search(s)
|
||||
return match is not None
|
||||
|
||||
|
||||
class HTTPStreamFile(AbstractBufferedFile):
|
||||
def __init__(self, fs, url, mode="rb", loop=None, session=None, **kwargs):
|
||||
self.asynchronous = kwargs.pop("asynchronous", False)
|
||||
self.url = url
|
||||
self.loop = loop
|
||||
self.session = session
|
||||
if mode != "rb":
|
||||
raise ValueError
|
||||
self.details = {"name": url, "size": None}
|
||||
super().__init__(fs=fs, path=url, mode=mode, cache_type="none", **kwargs)
|
||||
|
||||
async def cor():
|
||||
r = await self.session.get(self.fs.encode_url(url), **kwargs).__aenter__()
|
||||
self.fs._raise_not_found_for_status(r, url)
|
||||
return r
|
||||
|
||||
self.r = sync(self.loop, cor)
|
||||
self.loop = fs.loop
|
||||
|
||||
def seek(self, loc, whence=0):
|
||||
if loc == 0 and whence == 1:
|
||||
return
|
||||
if loc == self.loc and whence == 0:
|
||||
return
|
||||
raise ValueError("Cannot seek streaming HTTP file")
|
||||
|
||||
async def _read(self, num=-1):
|
||||
out = await self.r.content.read(num)
|
||||
self.loc += len(out)
|
||||
return out
|
||||
|
||||
read = sync_wrapper(_read)
|
||||
|
||||
async def _close(self):
|
||||
self.r.close()
|
||||
|
||||
def close(self):
|
||||
asyncio.run_coroutine_threadsafe(self._close(), self.loop)
|
||||
super().close()
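
# A minimal usage sketch (not part of the library API): block_size=0 selects a
# streaming file object such as the class above instead of HTTPFile, as the
# range-request error message suggests. The URL and helper name are placeholders.
def _example_streaming_read(url="https://example.com/big.bin"):
    import fsspec

    with fsspec.open(url, "rb", block_size=0) as f:
        return f.read(2**20)  # first MiB only; no random access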
|
||||
|
||||
|
||||
class AsyncStreamFile(AbstractAsyncStreamedFile):
|
||||
def __init__(
|
||||
self, fs, url, mode="rb", loop=None, session=None, size=None, **kwargs
|
||||
):
|
||||
self.url = url
|
||||
self.session = session
|
||||
self.r = None
|
||||
if mode != "rb":
|
||||
raise ValueError
|
||||
self.details = {"name": url, "size": None}
|
||||
self.kwargs = kwargs
|
||||
super().__init__(fs=fs, path=url, mode=mode, cache_type="none")
|
||||
self.size = size
|
||||
|
||||
async def read(self, num=-1):
|
||||
if self.r is None:
|
||||
r = await self.session.get(
|
||||
self.fs.encode_url(self.url), **self.kwargs
|
||||
).__aenter__()
|
||||
self.fs._raise_not_found_for_status(r, self.url)
|
||||
self.r = r
|
||||
out = await self.r.content.read(num)
|
||||
self.loc += len(out)
|
||||
return out
|
||||
|
||||
async def close(self):
|
||||
if self.r is not None:
|
||||
self.r.close()
|
||||
self.r = None
|
||||
await super().close()
|
||||
|
||||
|
||||
async def get_range(session, url, start, end, file=None, **kwargs):
|
||||
# explicitly get a range when we know it must be safe
|
||||
kwargs = kwargs.copy()
|
||||
headers = kwargs.pop("headers", {}).copy()
|
||||
headers["Range"] = f"bytes={start}-{end - 1}"
|
||||
r = await session.get(url, headers=headers, **kwargs)
|
||||
r.raise_for_status()
|
||||
async with r:
|
||||
out = await r.read()
|
||||
if file:
|
||||
with open(file, "r+b") as f: # noqa: ASYNC101, ASYNC230
|
||||
f.seek(start)
|
||||
f.write(out)
|
||||
else:
|
||||
return out
|
||||
|
||||
|
||||
async def _file_info(url, session, size_policy="head", **kwargs):
|
||||
"""Call HEAD on the server to get details about the file (size/checksum etc.)
|
||||
|
||||
Default operation is to explicitly allow redirects and use encoding
|
||||
'identity' (no compression) to get the true size of the target.
|
||||
"""
|
||||
logger.debug("Retrieve file size for %s", url)
|
||||
kwargs = kwargs.copy()
|
||||
ar = kwargs.pop("allow_redirects", True)
|
||||
head = kwargs.get("headers", {}).copy()
|
||||
head["Accept-Encoding"] = "identity"
|
||||
kwargs["headers"] = head
|
||||
|
||||
info = {}
|
||||
if size_policy == "head":
|
||||
r = await session.head(url, allow_redirects=ar, **kwargs)
|
||||
elif size_policy == "get":
|
||||
r = await session.get(url, allow_redirects=ar, **kwargs)
|
||||
else:
|
||||
raise TypeError(f'size_policy must be "head" or "get", got {size_policy}')
|
||||
async with r:
|
||||
r.raise_for_status()
|
||||
|
||||
if "Content-Length" in r.headers:
|
||||
# Some servers may choose to ignore Accept-Encoding and return
|
||||
# compressed content, in which case the returned size is unreliable.
|
||||
if "Content-Encoding" not in r.headers or r.headers["Content-Encoding"] in [
|
||||
"identity",
|
||||
"",
|
||||
]:
|
||||
info["size"] = int(r.headers["Content-Length"])
|
||||
elif "Content-Range" in r.headers:
|
||||
info["size"] = int(r.headers["Content-Range"].split("/")[1])
|
||||
|
||||
if "Content-Type" in r.headers:
|
||||
info["mimetype"] = r.headers["Content-Type"].partition(";")[0]
|
||||
|
||||
if r.headers.get("Accept-Ranges") == "none":
|
||||
# Some servers may explicitly discourage partial content requests, but
|
||||
# the lack of "Accept-Ranges" does not always indicate they would fail
|
||||
info["partial"] = False
|
||||
|
||||
info["url"] = str(r.url)
|
||||
|
||||
for checksum_field in ["ETag", "Content-MD5", "Digest"]:
|
||||
if r.headers.get(checksum_field):
|
||||
info[checksum_field] = r.headers[checksum_field]
|
||||
|
||||
return info
|
||||
|
||||
|
||||
async def _file_size(url, session=None, *args, **kwargs):
|
||||
if session is None:
|
||||
session = await get_client()
|
||||
info = await _file_info(url, session=session, *args, **kwargs)
|
||||
return info.get("size")
|
||||
|
||||
|
||||
file_size = sync_wrapper(_file_size)
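
# Hedged usage sketch of the helpers above (illustrative only; the URL is a
# placeholder and an event loop driving the coroutine is assumed):
async def _example_ranged_fetch(url="https://example.com/data.bin"):
    session = await get_client()
    try:
        size = await _file_size(url, session=session)       # HEAD-based size probe
        first_kb = await get_range(session, url, 0, 1024)   # bytes 0-1023 via Range header
        return size, first_kb
    finally:
        await session.close()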
|
||||
@@ -0,0 +1,931 @@
|
||||
"""This file is largely copied from http.py"""
|
||||
|
||||
import io
|
||||
import logging
|
||||
import re
|
||||
import urllib.error
|
||||
import urllib.parse
|
||||
from copy import copy
|
||||
from json import dumps, loads
|
||||
from urllib.parse import urlparse
|
||||
|
||||
try:
|
||||
import yarl
|
||||
except (ImportError, ModuleNotFoundError, OSError):
|
||||
yarl = False
|
||||
|
||||
from fsspec.callbacks import _DEFAULT_CALLBACK
|
||||
from fsspec.registry import register_implementation
|
||||
from fsspec.spec import AbstractBufferedFile, AbstractFileSystem
|
||||
from fsspec.utils import DEFAULT_BLOCK_SIZE, isfilelike, nullcontext, tokenize
|
||||
|
||||
from ..caching import AllBytes
|
||||
|
||||
# https://stackoverflow.com/a/15926317/3821154
|
||||
ex = re.compile(r"""<(a|A)\s+(?:[^>]*?\s+)?(href|HREF)=["'](?P<url>[^"']+)""")
|
||||
ex2 = re.compile(r"""(?P<url>http[s]?://[-a-zA-Z0-9@:%_+.~#?&/=]+)""")
|
||||
logger = logging.getLogger("fsspec.http")
|
||||
|
||||
|
||||
class JsHttpException(urllib.error.HTTPError): ...
|
||||
|
||||
|
||||
class StreamIO(io.BytesIO):
|
||||
# fake class, so you can set attributes on it
|
||||
# will eventually actually stream
|
||||
...
|
||||
|
||||
|
||||
class ResponseProxy:
|
||||
"""Looks like a requests response"""
|
||||
|
||||
def __init__(self, req, stream=False):
|
||||
self.request = req
|
||||
self.stream = stream
|
||||
self._data = None
|
||||
self._headers = None
|
||||
|
||||
@property
|
||||
def raw(self):
|
||||
if self._data is None:
|
||||
b = self.request.response.to_bytes()
|
||||
if self.stream:
|
||||
self._data = StreamIO(b)
|
||||
else:
|
||||
self._data = b
|
||||
return self._data
|
||||
|
||||
def close(self):
|
||||
if hasattr(self, "_data"):
|
||||
del self._data
|
||||
|
||||
@property
|
||||
def headers(self):
|
||||
if self._headers is None:
|
||||
self._headers = dict(
|
||||
[
|
||||
_.split(": ")
|
||||
for _ in self.request.getAllResponseHeaders().strip().split("\r\n")
|
||||
]
|
||||
)
|
||||
return self._headers
|
||||
|
||||
@property
|
||||
def status_code(self):
|
||||
return int(self.request.status)
|
||||
|
||||
def raise_for_status(self):
|
||||
if not self.ok:
|
||||
raise JsHttpException(
|
||||
self.url, self.status_code, self.reason, self.headers, None
|
||||
)
|
||||
|
||||
def iter_content(self, chunksize, *_, **__):
|
||||
while True:
|
||||
out = self.raw.read(chunksize)
|
||||
if out:
|
||||
yield out
|
||||
else:
|
||||
break
|
||||
|
||||
@property
|
||||
def reason(self):
|
||||
return self.request.statusText
|
||||
|
||||
@property
|
||||
def ok(self):
|
||||
return self.status_code < 400
|
||||
|
||||
@property
|
||||
def url(self):
|
||||
return self.request.response.responseURL
|
||||
|
||||
@property
|
||||
def text(self):
|
||||
# TODO: encoding from headers
|
||||
return self.content.decode()
|
||||
|
||||
@property
|
||||
def content(self):
|
||||
self.stream = False
|
||||
return self.raw
|
||||
|
||||
def json(self):
|
||||
return loads(self.text)
|
||||
|
||||
|
||||
class RequestsSessionShim:
|
||||
def __init__(self):
|
||||
self.headers = {}
|
||||
|
||||
def request(
|
||||
self,
|
||||
method,
|
||||
url,
|
||||
params=None,
|
||||
data=None,
|
||||
headers=None,
|
||||
cookies=None,
|
||||
files=None,
|
||||
auth=None,
|
||||
timeout=None,
|
||||
allow_redirects=None,
|
||||
proxies=None,
|
||||
hooks=None,
|
||||
stream=None,
|
||||
verify=None,
|
||||
cert=None,
|
||||
json=None,
|
||||
):
|
||||
from js import Blob, XMLHttpRequest
|
||||
|
||||
logger.debug("JS request: %s %s", method, url)
|
||||
|
||||
if cert or verify or proxies or files or cookies or hooks:
|
||||
raise NotImplementedError
|
||||
if data and json:
|
||||
raise ValueError("Use json= or data=, not both")
|
||||
req = XMLHttpRequest.new()
|
||||
extra = auth if auth else ()
|
||||
if params:
|
||||
url = f"{url}?{urllib.parse.urlencode(params)}"
|
||||
req.open(method, url, False, *extra)
|
||||
if timeout:
|
||||
req.timeout = timeout
|
||||
if headers:
|
||||
for k, v in headers.items():
|
||||
req.setRequestHeader(k, v)
|
||||
|
||||
req.setRequestHeader("Accept", "application/octet-stream")
|
||||
req.responseType = "arraybuffer"
|
||||
if json:
|
||||
blob = Blob.new([dumps(json)], {type: "application/json"})
|
||||
req.send(blob)
|
||||
elif data:
|
||||
if isinstance(data, io.IOBase):
|
||||
data = data.read()
|
||||
blob = Blob.new([data], {type: "application/octet-stream"})
|
||||
req.send(blob)
|
||||
else:
|
||||
req.send(None)
|
||||
return ResponseProxy(req, stream=stream)
|
||||
|
||||
def get(self, url, **kwargs):
|
||||
return self.request("GET", url, **kwargs)
|
||||
|
||||
def head(self, url, **kwargs):
|
||||
return self.request("HEAD", url, **kwargs)
|
||||
|
||||
def post(self, url, **kwargs):
|
||||
return self.request("POST}", url, **kwargs)
|
||||
|
||||
def put(self, url, **kwargs):
|
||||
return self.request("PUT", url, **kwargs)
|
||||
|
||||
def patch(self, url, **kwargs):
|
||||
return self.request("PATCH", url, **kwargs)
|
||||
|
||||
def delete(self, url, **kwargs):
|
||||
return self.request("DELETE", url, **kwargs)
|
||||
|
||||
|
||||
class HTTPFileSystem(AbstractFileSystem):
|
||||
"""
|
||||
Simple File-System for fetching data via HTTP(S)
|
||||
|
||||
This is the BLOCKING version of the normal HTTPFileSystem. It uses
|
||||
requests in normal python and the JS runtime in pyodide.
|
||||
|
||||
***This implementation is extremely experimental, do not use unless
|
||||
you are testing pyodide/pyscript integration***
|
||||
"""
|
||||
|
||||
protocol = ("http", "https", "sync-http", "sync-https")
|
||||
sep = "/"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
simple_links=True,
|
||||
block_size=None,
|
||||
same_scheme=True,
|
||||
cache_type="readahead",
|
||||
cache_options=None,
|
||||
client_kwargs=None,
|
||||
encoded=False,
|
||||
**storage_options,
|
||||
):
|
||||
"""
|
||||
|
||||
Parameters
|
||||
----------
|
||||
block_size: int
|
||||
Blocks to read bytes; if 0, will default to raw requests file-like
|
||||
objects instead of HTTPFile instances
|
||||
simple_links: bool
|
||||
If True, will consider both HTML <a> tags and anything that looks
|
||||
like a URL; if False, will consider only the former.
|
||||
same_scheme: True
|
||||
When doing ls/glob, if this is True, only consider paths that have
|
||||
http/https matching the input URLs.
|
||||
size_policy: this argument is deprecated
|
||||
client_kwargs: dict
|
||||
Passed to aiohttp.ClientSession, see
|
||||
https://docs.aiohttp.org/en/stable/client_reference.html
|
||||
For example, ``{'auth': aiohttp.BasicAuth('user', 'pass')}``
|
||||
storage_options: key-value
|
||||
Any other parameters passed on to requests
|
||||
cache_type, cache_options: defaults used in open
|
||||
"""
|
||||
super().__init__(self, **storage_options)
|
||||
self.block_size = block_size if block_size is not None else DEFAULT_BLOCK_SIZE
|
||||
self.simple_links = simple_links
|
||||
self.same_schema = same_scheme
|
||||
self.cache_type = cache_type
|
||||
self.cache_options = cache_options
|
||||
self.client_kwargs = client_kwargs or {}
|
||||
self.encoded = encoded
|
||||
self.kwargs = storage_options
|
||||
|
||||
try:
|
||||
import js # noqa: F401
|
||||
|
||||
logger.debug("Starting JS session")
|
||||
self.session = RequestsSessionShim()
|
||||
self.js = True
|
||||
except Exception as e:
|
||||
import requests
|
||||
|
||||
logger.debug("Starting cpython session because of: %s", e)
|
||||
self.session = requests.Session(**(client_kwargs or {}))
|
||||
self.js = False
|
||||
|
||||
request_options = copy(storage_options)
|
||||
self.use_listings_cache = request_options.pop("use_listings_cache", False)
|
||||
request_options.pop("listings_expiry_time", None)
|
||||
request_options.pop("max_paths", None)
|
||||
request_options.pop("skip_instance_cache", None)
|
||||
self.kwargs = request_options
|
||||
|
||||
@property
|
||||
def fsid(self):
|
||||
return "sync-http"
|
||||
|
||||
def encode_url(self, url):
|
||||
if yarl:
|
||||
return yarl.URL(url, encoded=self.encoded)
|
||||
return url
|
||||
|
||||
@classmethod
|
||||
def _strip_protocol(cls, path: str) -> str:
|
||||
"""For HTTP, we always want to keep the full URL"""
|
||||
path = path.replace("sync-http://", "http://").replace(
|
||||
"sync-https://", "https://"
|
||||
)
|
||||
return path
|
||||
|
||||
@classmethod
|
||||
def _parent(cls, path):
|
||||
# override, since _strip_protocol is different for URLs
|
||||
par = super()._parent(path)
|
||||
if len(par) > 7: # "http://..."
|
||||
return par
|
||||
return ""
|
||||
|
||||
def _ls_real(self, url, detail=True, **kwargs):
|
||||
# ignoring URL-encoded arguments
|
||||
kw = self.kwargs.copy()
|
||||
kw.update(kwargs)
|
||||
logger.debug(url)
|
||||
r = self.session.get(self.encode_url(url), **self.kwargs)
|
||||
self._raise_not_found_for_status(r, url)
|
||||
text = r.text
|
||||
if self.simple_links:
|
||||
links = ex2.findall(text) + [u[2] for u in ex.findall(text)]
|
||||
else:
|
||||
links = [u[2] for u in ex.findall(text)]
|
||||
out = set()
|
||||
parts = urlparse(url)
|
||||
for l in links:
|
||||
if isinstance(l, tuple):
|
||||
l = l[1]
|
||||
if l.startswith("/") and len(l) > 1:
|
||||
# absolute URL on this server
|
||||
l = parts.scheme + "://" + parts.netloc + l
|
||||
if l.startswith("http"):
|
||||
if self.same_schema and l.startswith(url.rstrip("/") + "/"):
|
||||
out.add(l)
|
||||
elif l.replace("https", "http").startswith(
|
||||
url.replace("https", "http").rstrip("/") + "/"
|
||||
):
|
||||
# allowed to cross http <-> https
|
||||
out.add(l)
|
||||
else:
|
||||
if l not in ["..", "../"]:
|
||||
# Ignore FTP-like "parent"
|
||||
out.add("/".join([url.rstrip("/"), l.lstrip("/")]))
|
||||
if not out and url.endswith("/"):
|
||||
out = self._ls_real(url.rstrip("/"), detail=False)
|
||||
if detail:
|
||||
return [
|
||||
{
|
||||
"name": u,
|
||||
"size": None,
|
||||
"type": "directory" if u.endswith("/") else "file",
|
||||
}
|
||||
for u in out
|
||||
]
|
||||
else:
|
||||
return sorted(out)
|
||||
|
||||
def ls(self, url, detail=True, **kwargs):
|
||||
if self.use_listings_cache and url in self.dircache:
|
||||
out = self.dircache[url]
|
||||
else:
|
||||
out = self._ls_real(url, detail=detail, **kwargs)
|
||||
self.dircache[url] = out
|
||||
return out
|
||||
|
||||
def _raise_not_found_for_status(self, response, url):
|
||||
"""
|
||||
Raises FileNotFoundError for 404s, otherwise uses raise_for_status.
|
||||
"""
|
||||
if response.status_code == 404:
|
||||
raise FileNotFoundError(url)
|
||||
response.raise_for_status()
|
||||
|
||||
def cat_file(self, url, start=None, end=None, **kwargs):
|
||||
kw = self.kwargs.copy()
|
||||
kw.update(kwargs)
|
||||
logger.debug(url)
|
||||
|
||||
if start is not None or end is not None:
|
||||
if start == end:
|
||||
return b""
|
||||
headers = kw.pop("headers", {}).copy()
|
||||
|
||||
headers["Range"] = self._process_limits(url, start, end)
|
||||
kw["headers"] = headers
|
||||
r = self.session.get(self.encode_url(url), **kw)
|
||||
self._raise_not_found_for_status(r, url)
|
||||
return r.content
|
||||
|
||||
def get_file(
|
||||
self, rpath, lpath, chunk_size=5 * 2**20, callback=_DEFAULT_CALLBACK, **kwargs
|
||||
):
|
||||
kw = self.kwargs.copy()
|
||||
kw.update(kwargs)
|
||||
logger.debug(rpath)
|
||||
r = self.session.get(self.encode_url(rpath), **kw)
|
||||
try:
|
||||
size = int(
|
||||
r.headers.get("content-length", None)
|
||||
or r.headers.get("Content-Length", None)
|
||||
)
|
||||
except (ValueError, KeyError, TypeError):
|
||||
size = None
|
||||
|
||||
callback.set_size(size)
|
||||
self._raise_not_found_for_status(r, rpath)
|
||||
if not isfilelike(lpath):
|
||||
lpath = open(lpath, "wb")
|
||||
for chunk in r.iter_content(chunk_size, decode_unicode=False):
|
||||
lpath.write(chunk)
|
||||
callback.relative_update(len(chunk))
|
||||
|
||||
def put_file(
|
||||
self,
|
||||
lpath,
|
||||
rpath,
|
||||
chunk_size=5 * 2**20,
|
||||
callback=_DEFAULT_CALLBACK,
|
||||
method="post",
|
||||
**kwargs,
|
||||
):
|
||||
def gen_chunks():
|
||||
# Support passing arbitrary file-like objects
|
||||
# and use them instead of streams.
|
||||
if isinstance(lpath, io.IOBase):
|
||||
context = nullcontext(lpath)
|
||||
use_seek = False # might not support seeking
|
||||
else:
|
||||
context = open(lpath, "rb")
|
||||
use_seek = True
|
||||
|
||||
with context as f:
|
||||
if use_seek:
|
||||
callback.set_size(f.seek(0, 2))
|
||||
f.seek(0)
|
||||
else:
|
||||
callback.set_size(getattr(f, "size", None))
|
||||
|
||||
chunk = f.read(chunk_size)
|
||||
while chunk:
|
||||
yield chunk
|
||||
callback.relative_update(len(chunk))
|
||||
chunk = f.read(chunk_size)
|
||||
|
||||
kw = self.kwargs.copy()
|
||||
kw.update(kwargs)
|
||||
|
||||
method = method.lower()
|
||||
if method not in ("post", "put"):
|
||||
raise ValueError(
|
||||
f"method has to be either 'post' or 'put', not: {method!r}"
|
||||
)
|
||||
|
||||
meth = getattr(self.session, method)
|
||||
resp = meth(rpath, data=gen_chunks(), **kw)
|
||||
self._raise_not_found_for_status(resp, rpath)
|
||||
|
||||
def _process_limits(self, url, start, end):
|
||||
"""Helper for "Range"-based _cat_file"""
|
||||
size = None
|
||||
suff = False
|
||||
if start is not None and start < 0:
|
||||
# if start is negative and end None, end is the "suffix length"
|
||||
if end is None:
|
||||
end = -start
|
||||
start = ""
|
||||
suff = True
|
||||
else:
|
||||
size = size or self.info(url)["size"]
|
||||
start = size + start
|
||||
elif start is None:
|
||||
start = 0
|
||||
if not suff:
|
||||
if end is not None and end < 0:
|
||||
if start is not None:
|
||||
size = size or self.info(url)["size"]
|
||||
end = size + end
|
||||
elif end is None:
|
||||
end = ""
|
||||
if isinstance(end, int):
|
||||
end -= 1 # bytes range is inclusive
|
||||
return f"bytes={start}-{end}"
|
||||
|
||||
def exists(self, path, **kwargs):
|
||||
kw = self.kwargs.copy()
|
||||
kw.update(kwargs)
|
||||
try:
|
||||
logger.debug(path)
|
||||
r = self.session.get(self.encode_url(path), **kw)
|
||||
return r.status_code < 400
|
||||
except Exception:
|
||||
return False
|
||||
|
||||
def isfile(self, path, **kwargs):
|
||||
return self.exists(path, **kwargs)
|
||||
|
||||
def _open(
|
||||
self,
|
||||
path,
|
||||
mode="rb",
|
||||
block_size=None,
|
||||
autocommit=None, # XXX: This differs from the base class.
|
||||
cache_type=None,
|
||||
cache_options=None,
|
||||
size=None,
|
||||
**kwargs,
|
||||
):
|
||||
"""Make a file-like object
|
||||
|
||||
Parameters
|
||||
----------
|
||||
path: str
|
||||
Full URL with protocol
|
||||
mode: string
|
||||
must be "rb"
|
||||
block_size: int or None
|
||||
Bytes to download in one request; use instance value if None. If
|
||||
zero, will return a streaming Requests file-like instance.
|
||||
kwargs: key-value
|
||||
Any other parameters, passed to requests calls
|
||||
"""
|
||||
if mode != "rb":
|
||||
raise NotImplementedError
|
||||
block_size = block_size if block_size is not None else self.block_size
|
||||
kw = self.kwargs.copy()
|
||||
kw.update(kwargs)
|
||||
size = size or self.info(path, **kwargs)["size"]
|
||||
if block_size and size:
|
||||
return HTTPFile(
|
||||
self,
|
||||
path,
|
||||
session=self.session,
|
||||
block_size=block_size,
|
||||
mode=mode,
|
||||
size=size,
|
||||
cache_type=cache_type or self.cache_type,
|
||||
cache_options=cache_options or self.cache_options,
|
||||
**kw,
|
||||
)
|
||||
else:
|
||||
return HTTPStreamFile(
|
||||
self,
|
||||
path,
|
||||
mode=mode,
|
||||
session=self.session,
|
||||
**kw,
|
||||
)
|
||||
|
||||
def ukey(self, url):
|
||||
"""Unique identifier; assume HTTP files are static, unchanging"""
|
||||
return tokenize(url, self.kwargs, self.protocol)
|
||||
|
||||
def info(self, url, **kwargs):
|
||||
"""Get info of URL
|
||||
|
||||
Tries to access location via HEAD, and then GET methods, but does
|
||||
not fetch the data.
|
||||
|
||||
It is possible that the server does not supply any size information, in
|
||||
which case size will be given as None (and certain operations on the
|
||||
corresponding file will not work).
|
||||
"""
|
||||
info = {}
|
||||
for policy in ["head", "get"]:
|
||||
try:
|
||||
info.update(
|
||||
_file_info(
|
||||
self.encode_url(url),
|
||||
size_policy=policy,
|
||||
session=self.session,
|
||||
**self.kwargs,
|
||||
**kwargs,
|
||||
)
|
||||
)
|
||||
if info.get("size") is not None:
|
||||
break
|
||||
except Exception as exc:
|
||||
if policy == "get":
|
||||
# If get failed, then raise a FileNotFoundError
|
||||
raise FileNotFoundError(url) from exc
|
||||
logger.debug(str(exc))
|
||||
|
||||
return {"name": url, "size": None, **info, "type": "file"}
|
||||
|
||||
def glob(self, path, maxdepth=None, **kwargs):
|
||||
"""
|
||||
Find files by glob-matching.
|
||||
|
||||
This implementation is identical to the one in AbstractFileSystem,
|
||||
but "?" is not considered as a character for globbing, because it is
|
||||
so common in URLs, often identifying the "query" part.
|
||||
"""
|
||||
import re
|
||||
|
||||
ends = path.endswith("/")
|
||||
path = self._strip_protocol(path)
|
||||
indstar = path.find("*") if path.find("*") >= 0 else len(path)
|
||||
indbrace = path.find("[") if path.find("[") >= 0 else len(path)
|
||||
|
||||
ind = min(indstar, indbrace)
|
||||
|
||||
detail = kwargs.pop("detail", False)
|
||||
|
||||
if not has_magic(path):
|
||||
root = path
|
||||
depth = 1
|
||||
if ends:
|
||||
path += "/*"
|
||||
elif self.exists(path):
|
||||
if not detail:
|
||||
return [path]
|
||||
else:
|
||||
return {path: self.info(path)}
|
||||
else:
|
||||
if not detail:
|
||||
return [] # glob of non-existent returns empty
|
||||
else:
|
||||
return {}
|
||||
elif "/" in path[:ind]:
|
||||
ind2 = path[:ind].rindex("/")
|
||||
root = path[: ind2 + 1]
|
||||
depth = None if "**" in path else path[ind2 + 1 :].count("/") + 1
|
||||
else:
|
||||
root = ""
|
||||
depth = None if "**" in path else path[ind + 1 :].count("/") + 1
|
||||
|
||||
allpaths = self.find(
|
||||
root, maxdepth=maxdepth or depth, withdirs=True, detail=True, **kwargs
|
||||
)
|
||||
# Escape characters special to python regex, leaving our supported
|
||||
# special characters in place.
|
||||
# See https://www.gnu.org/software/bash/manual/html_node/Pattern-Matching.html
|
||||
# for shell globbing details.
|
||||
pattern = (
|
||||
"^"
|
||||
+ (
|
||||
path.replace("\\", r"\\")
|
||||
.replace(".", r"\.")
|
||||
.replace("+", r"\+")
|
||||
.replace("//", "/")
|
||||
.replace("(", r"\(")
|
||||
.replace(")", r"\)")
|
||||
.replace("|", r"\|")
|
||||
.replace("^", r"\^")
|
||||
.replace("$", r"\$")
|
||||
.replace("{", r"\{")
|
||||
.replace("}", r"\}")
|
||||
.rstrip("/")
|
||||
)
|
||||
+ "$"
|
||||
)
|
||||
pattern = re.sub("[*]{2}", "=PLACEHOLDER=", pattern)
|
||||
pattern = re.sub("[*]", "[^/]*", pattern)
|
||||
pattern = re.compile(pattern.replace("=PLACEHOLDER=", ".*"))
|
||||
out = {
|
||||
p: allpaths[p]
|
||||
for p in sorted(allpaths)
|
||||
if pattern.match(p.replace("//", "/").rstrip("/"))
|
||||
}
|
||||
if detail:
|
||||
return out
|
||||
else:
|
||||
return list(out)
|
||||
|
||||
def isdir(self, path):
|
||||
# override, since all URLs are (also) files
|
||||
try:
|
||||
return bool(self.ls(path))
|
||||
except (FileNotFoundError, ValueError):
|
||||
return False
|
||||
|
||||
|
||||
class HTTPFile(AbstractBufferedFile):
|
||||
"""
|
||||
A file-like object pointing to a remote HTTP(S) resource
|
||||
|
||||
Supports only reading, with read-ahead of a predetermined block-size.
|
||||
|
||||
In the case that the server does not supply the filesize, only reading of
|
||||
the complete file in one go is supported.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
url: str
|
||||
Full URL of the remote resource, including the protocol
|
||||
session: requests.Session or None
|
||||
All calls will be made within this session, to avoid restarting
|
||||
connections where the server allows this
|
||||
block_size: int or None
|
||||
The amount of read-ahead to do, in bytes. Default is 5MB, or the value
|
||||
configured for the FileSystem creating this file
|
||||
size: None or int
|
||||
If given, this is the size of the file in bytes, and we don't attempt
|
||||
to call the server to find the value.
|
||||
kwargs: all other key-values are passed to requests calls.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
fs,
|
||||
url,
|
||||
session=None,
|
||||
block_size=None,
|
||||
mode="rb",
|
||||
cache_type="bytes",
|
||||
cache_options=None,
|
||||
size=None,
|
||||
**kwargs,
|
||||
):
|
||||
if mode != "rb":
|
||||
raise NotImplementedError("File mode not supported")
|
||||
self.url = url
|
||||
self.session = session
|
||||
self.details = {"name": url, "size": size, "type": "file"}
|
||||
super().__init__(
|
||||
fs=fs,
|
||||
path=url,
|
||||
mode=mode,
|
||||
block_size=block_size,
|
||||
cache_type=cache_type,
|
||||
cache_options=cache_options,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
def read(self, length=-1):
|
||||
"""Read bytes from file
|
||||
|
||||
Parameters
|
||||
----------
|
||||
length: int
|
||||
Read up to this many bytes. If negative, read all content to end of
|
||||
file. If the server has not supplied the filesize, attempting to
|
||||
read only part of the data will raise a ValueError.
|
||||
"""
|
||||
if (
|
||||
(length < 0 and self.loc == 0) # explicit read all
|
||||
# but not when the size is known and fits into a block anyways
|
||||
and not (self.size is not None and self.size <= self.blocksize)
|
||||
):
|
||||
self._fetch_all()
|
||||
if self.size is None:
|
||||
if length < 0:
|
||||
self._fetch_all()
|
||||
else:
|
||||
length = min(self.size - self.loc, length)
|
||||
return super().read(length)
|
||||
|
||||
def _fetch_all(self):
|
||||
"""Read whole file in one shot, without caching
|
||||
|
||||
This is only called when position is still at zero,
|
||||
and read() is called without a byte-count.
|
||||
"""
|
||||
logger.debug(f"Fetch all for {self}")
|
||||
if not isinstance(self.cache, AllBytes):
|
||||
r = self.session.get(self.fs.encode_url(self.url), **self.kwargs)
|
||||
r.raise_for_status()
|
||||
out = r.content
|
||||
self.cache = AllBytes(size=len(out), fetcher=None, blocksize=None, data=out)
|
||||
self.size = len(out)
|
||||
|
||||
def _parse_content_range(self, headers):
|
||||
"""Parse the Content-Range header"""
|
||||
s = headers.get("Content-Range", "")
|
||||
m = re.match(r"bytes (\d+-\d+|\*)/(\d+|\*)", s)
|
||||
if not m:
|
||||
return None, None, None
|
||||
|
||||
if m[1] == "*":
|
||||
start = end = None
|
||||
else:
|
||||
start, end = [int(x) for x in m[1].split("-")]
|
||||
total = None if m[2] == "*" else int(m[2])
|
||||
return start, end, total
|
||||
|
||||
def _fetch_range(self, start, end):
|
||||
"""Download a block of data
|
||||
|
||||
The expectation is that the server returns only the requested bytes,
|
||||
with HTTP code 206. If this is not the case, we first check the headers,
|
||||
and then stream the output - if the data size is bigger than we
|
||||
requested, an exception is raised.
|
||||
"""
|
||||
logger.debug(f"Fetch range for {self}: {start}-{end}")
|
||||
kwargs = self.kwargs.copy()
|
||||
headers = kwargs.pop("headers", {}).copy()
|
||||
headers["Range"] = f"bytes={start}-{end - 1}"
|
||||
logger.debug("%s : %s", self.url, headers["Range"])
|
||||
r = self.session.get(self.fs.encode_url(self.url), headers=headers, **kwargs)
|
||||
if r.status_code == 416:
|
||||
# range request outside file
|
||||
return b""
|
||||
r.raise_for_status()
|
||||
|
||||
# If the server has handled the range request, it should reply
|
||||
# with status 206 (partial content). But we'll guess that a suitable
|
||||
# Content-Range header or a Content-Length no more than the
|
||||
# requested range also means we have got the desired range.
|
||||
cl = r.headers.get("Content-Length", r.headers.get("content-length", end + 1))
|
||||
response_is_range = (
|
||||
r.status_code == 206
|
||||
or self._parse_content_range(r.headers)[0] == start
|
||||
or int(cl) <= end - start
|
||||
)
|
||||
|
||||
if response_is_range:
|
||||
# partial content, as expected
|
||||
out = r.content
|
||||
elif start > 0:
|
||||
raise ValueError(
|
||||
"The HTTP server doesn't appear to support range requests. "
|
||||
"Only reading this file from the beginning is supported. "
|
||||
"Open with block_size=0 for a streaming file interface."
|
||||
)
|
||||
else:
|
||||
# Response is not a range, but we want the start of the file,
|
||||
# so we can read the required amount anyway.
|
||||
cl = 0
|
||||
out = []
|
||||
for chunk in r.iter_content(2**20, False):
|
||||
out.append(chunk)
|
||||
cl += len(chunk)
|
||||
out = b"".join(out)[: end - start]
|
||||
return out
|
||||
|
||||
|
||||
magic_check = re.compile("([*[])")
|
||||
|
||||
|
||||
def has_magic(s):
|
||||
match = magic_check.search(s)
|
||||
return match is not None
|
||||
|
||||
|
||||
class HTTPStreamFile(AbstractBufferedFile):
|
||||
def __init__(self, fs, url, mode="rb", session=None, **kwargs):
|
||||
self.url = url
|
||||
self.session = session
|
||||
if mode != "rb":
|
||||
raise ValueError
|
||||
self.details = {"name": url, "size": None}
|
||||
super().__init__(fs=fs, path=url, mode=mode, cache_type="readahead", **kwargs)
|
||||
|
||||
r = self.session.get(self.fs.encode_url(url), stream=True, **kwargs)
|
||||
self.fs._raise_not_found_for_status(r, url)
|
||||
self.it = r.iter_content(1024, False)
|
||||
self.leftover = b""
|
||||
|
||||
self.r = r
|
||||
|
||||
def seek(self, *args, **kwargs):
|
||||
raise ValueError("Cannot seek streaming HTTP file")
|
||||
|
||||
def read(self, num=-1):
|
||||
bufs = [self.leftover]
|
||||
leng = len(self.leftover)
|
||||
while leng < num or num < 0:
|
||||
try:
|
||||
out = self.it.__next__()
|
||||
except StopIteration:
|
||||
break
|
||||
if out:
|
||||
bufs.append(out)
|
||||
else:
|
||||
break
|
||||
leng += len(out)
|
||||
out = b"".join(bufs)
|
||||
if num >= 0:
|
||||
self.leftover = out[num:]
|
||||
out = out[:num]
|
||||
else:
|
||||
self.leftover = b""
|
||||
self.loc += len(out)
|
||||
return out
|
||||
|
||||
def close(self):
|
||||
self.r.close()
|
||||
self.closed = True
|
||||
|
||||
|
||||
def get_range(session, url, start, end, **kwargs):
|
||||
# explicitly get a range when we know it must be safe
|
||||
kwargs = kwargs.copy()
|
||||
headers = kwargs.pop("headers", {}).copy()
|
||||
headers["Range"] = f"bytes={start}-{end - 1}"
|
||||
r = session.get(url, headers=headers, **kwargs)
|
||||
r.raise_for_status()
|
||||
return r.content
|
||||
|
||||
|
||||
def _file_info(url, session, size_policy="head", **kwargs):
|
||||
"""Call HEAD on the server to get details about the file (size/checksum etc.)
|
||||
|
||||
Default operation is to explicitly allow redirects and use encoding
|
||||
'identity' (no compression) to get the true size of the target.
|
||||
"""
|
||||
logger.debug("Retrieve file size for %s", url)
|
||||
kwargs = kwargs.copy()
|
||||
ar = kwargs.pop("allow_redirects", True)
|
||||
head = kwargs.get("headers", {}).copy()
|
||||
# TODO: not allowed in JS
|
||||
# head["Accept-Encoding"] = "identity"
|
||||
kwargs["headers"] = head
|
||||
|
||||
info = {}
|
||||
if size_policy == "head":
|
||||
r = session.head(url, allow_redirects=ar, **kwargs)
|
||||
elif size_policy == "get":
|
||||
r = session.get(url, allow_redirects=ar, **kwargs)
|
||||
else:
|
||||
raise TypeError(f'size_policy must be "head" or "get", got {size_policy}')
|
||||
r.raise_for_status()
|
||||
|
||||
# TODO:
|
||||
# recognise lack of 'Accept-Ranges',
|
||||
# or 'Accept-Ranges': 'none' (not 'bytes')
|
||||
# to mean streaming only, no random access => return None
|
||||
if "Content-Length" in r.headers:
|
||||
info["size"] = int(r.headers["Content-Length"])
|
||||
elif "Content-Range" in r.headers:
|
||||
info["size"] = int(r.headers["Content-Range"].split("/")[1])
|
||||
elif "content-length" in r.headers:
|
||||
info["size"] = int(r.headers["content-length"])
|
||||
elif "content-range" in r.headers:
|
||||
info["size"] = int(r.headers["content-range"].split("/")[1])
|
||||
|
||||
for checksum_field in ["ETag", "Content-MD5", "Digest"]:
|
||||
if r.headers.get(checksum_field):
|
||||
info[checksum_field] = r.headers[checksum_field]
|
||||
|
||||
return info
|
||||
|
||||
|
||||
# importing this is enough to register it
|
||||
def register():
|
||||
register_implementation("http", HTTPFileSystem, clobber=True)
|
||||
register_implementation("https", HTTPFileSystem, clobber=True)
|
||||
register_implementation("sync-http", HTTPFileSystem, clobber=True)
|
||||
register_implementation("sync-https", HTTPFileSystem, clobber=True)
|
||||
|
||||
|
||||
register()
|
||||
|
||||
|
||||
def unregister():
|
||||
from fsspec.implementations.http import HTTPFileSystem
|
||||
|
||||
register_implementation("http", HTTPFileSystem, clobber=True)
|
||||
register_implementation("https", HTTPFileSystem, clobber=True)
|
||||
@@ -0,0 +1,124 @@
|
||||
import base64
|
||||
import io
|
||||
import re
|
||||
|
||||
import requests
|
||||
|
||||
import fsspec
|
||||
|
||||
|
||||
class JupyterFileSystem(fsspec.AbstractFileSystem):
|
||||
"""View of the files as seen by a Jupyter server (notebook or lab)"""
|
||||
|
||||
protocol = ("jupyter", "jlab")
|
||||
|
||||
def __init__(self, url, tok=None, **kwargs):
|
||||
"""
|
||||
|
||||
Parameters
|
||||
----------
|
||||
url : str
|
||||
Base URL of the server, like "http://127.0.0.1:8888". May include
|
||||
token in the string, which is given by the process when starting up
|
||||
tok : str
|
||||
If the token is obtained separately, can be given here
|
||||
kwargs
|
||||
"""
|
||||
if "?" in url:
|
||||
if tok is None:
|
||||
try:
|
||||
tok = re.findall("token=([a-z0-9]+)", url)[0]
|
||||
except IndexError as e:
|
||||
raise ValueError("Could not determine token") from e
|
||||
url = url.split("?", 1)[0]
|
||||
self.url = url.rstrip("/") + "/api/contents"
|
||||
self.session = requests.Session()
|
||||
if tok:
|
||||
self.session.headers["Authorization"] = f"token {tok}"
|
||||
|
||||
super().__init__(**kwargs)
|
||||
|
||||
def ls(self, path, detail=True, **kwargs):
|
||||
path = self._strip_protocol(path)
|
||||
r = self.session.get(f"{self.url}/{path}")
|
||||
if r.status_code == 404:
|
||||
raise FileNotFoundError(path)
|
||||
r.raise_for_status()
|
||||
out = r.json()
|
||||
|
||||
if out["type"] == "directory":
|
||||
out = out["content"]
|
||||
else:
|
||||
out = [out]
|
||||
for o in out:
|
||||
o["name"] = o.pop("path")
|
||||
o.pop("content")
|
||||
if o["type"] == "notebook":
|
||||
o["type"] = "file"
|
||||
if detail:
|
||||
return out
|
||||
return [o["name"] for o in out]
|
||||
|
||||
def cat_file(self, path, start=None, end=None, **kwargs):
|
||||
path = self._strip_protocol(path)
|
||||
r = self.session.get(f"{self.url}/{path}")
|
||||
if r.status_code == 404:
|
||||
raise FileNotFoundError(path)
|
||||
r.raise_for_status()
|
||||
out = r.json()
|
||||
if out["format"] == "text":
|
||||
# data should be binary
|
||||
b = out["content"].encode()
|
||||
else:
|
||||
b = base64.b64decode(out["content"])
|
||||
return b[start:end]
|
||||
|
||||
def pipe_file(self, path, value, **_):
|
||||
path = self._strip_protocol(path)
|
||||
json = {
|
||||
"name": path.rsplit("/", 1)[-1],
|
||||
"path": path,
|
||||
"size": len(value),
|
||||
"content": base64.b64encode(value).decode(),
|
||||
"format": "base64",
|
||||
"type": "file",
|
||||
}
|
||||
self.session.put(f"{self.url}/{path}", json=json)
|
||||
|
||||
def mkdir(self, path, create_parents=True, **kwargs):
|
||||
path = self._strip_protocol(path)
|
||||
if create_parents and "/" in path:
|
||||
self.mkdir(path.rsplit("/", 1)[0], True)
|
||||
json = {
|
||||
"name": path.rsplit("/", 1)[-1],
|
||||
"path": path,
|
||||
"size": None,
|
||||
"content": None,
|
||||
"type": "directory",
|
||||
}
|
||||
self.session.put(f"{self.url}/{path}", json=json)
|
||||
|
||||
def _rm(self, path):
|
||||
path = self._strip_protocol(path)
|
||||
self.session.delete(f"{self.url}/{path}")
|
||||
|
||||
def _open(self, path, mode="rb", **kwargs):
|
||||
path = self._strip_protocol(path)
|
||||
if mode == "rb":
|
||||
data = self.cat_file(path)
|
||||
return io.BytesIO(data)
|
||||
else:
|
||||
return SimpleFileWriter(self, path, mode="wb")
|
||||
|
||||
|
||||
class SimpleFileWriter(fsspec.spec.AbstractBufferedFile):
|
||||
def _upload_chunk(self, final=False):
|
||||
"""Never uploads a chunk until file is done
|
||||
|
||||
Not suitable for large files
|
||||
"""
|
||||
if final is False:
|
||||
return False
|
||||
self.buffer.seek(0)
|
||||
data = self.buffer.read()
|
||||
self.fs.pipe_file(self.path, data)
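
# Hedged usage sketch (illustrative only; the server URL and token are
# placeholders for a locally running Jupyter server):
def _example_jupyter_listing():
    fs = JupyterFileSystem("http://127.0.0.1:8888/?token=abc123")
    names = fs.ls("", detail=False)  # file names at the server root
    return names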
|
||||
@@ -0,0 +1,213 @@
|
||||
from contextlib import contextmanager
|
||||
from ctypes import (
|
||||
CFUNCTYPE,
|
||||
POINTER,
|
||||
c_int,
|
||||
c_longlong,
|
||||
c_void_p,
|
||||
cast,
|
||||
create_string_buffer,
|
||||
)
|
||||
|
||||
import libarchive
|
||||
import libarchive.ffi as ffi
|
||||
|
||||
from fsspec import open_files
|
||||
from fsspec.archive import AbstractArchiveFileSystem
|
||||
from fsspec.implementations.memory import MemoryFile
|
||||
from fsspec.utils import DEFAULT_BLOCK_SIZE
|
||||
|
||||
# Libarchive requires seekable files or memory only for certain archive
|
||||
# types. However, since we read the directory first to cache the contents
|
||||
# and also allow random access to any file, the file-like object needs
|
||||
# to be seekable no matter what.
|
||||
|
||||
# Seek call-backs (not provided in the libarchive python wrapper)
|
||||
SEEK_CALLBACK = CFUNCTYPE(c_longlong, c_int, c_void_p, c_longlong, c_int)
|
||||
read_set_seek_callback = ffi.ffi(
|
||||
"read_set_seek_callback", [ffi.c_archive_p, SEEK_CALLBACK], c_int, ffi.check_int
|
||||
)
|
||||
new_api = hasattr(ffi, "NO_OPEN_CB")
|
||||
|
||||
|
||||
@contextmanager
|
||||
def custom_reader(file, format_name="all", filter_name="all", block_size=ffi.page_size):
|
||||
"""Read an archive from a seekable file-like object.
|
||||
|
||||
The `file` object must support the standard `readinto` and `seek` methods.
|
||||
"""
|
||||
buf = create_string_buffer(block_size)
|
||||
buf_p = cast(buf, c_void_p)
|
||||
|
||||
def read_func(archive_p, context, ptrptr):
|
||||
# readinto the buffer, returns number of bytes read
|
||||
length = file.readinto(buf)
|
||||
# write the address of the buffer into the pointer
|
||||
ptrptr = cast(ptrptr, POINTER(c_void_p))
|
||||
ptrptr[0] = buf_p
|
||||
# tell libarchive how much data was written into the buffer
|
||||
return length
|
||||
|
||||
def seek_func(archive_p, context, offset, whence):
|
||||
file.seek(offset, whence)
|
||||
# tell libarchive the current position
|
||||
return file.tell()
|
||||
|
||||
read_cb = ffi.READ_CALLBACK(read_func)
|
||||
seek_cb = SEEK_CALLBACK(seek_func)
|
||||
|
||||
if new_api:
|
||||
open_cb = ffi.NO_OPEN_CB
|
||||
close_cb = ffi.NO_CLOSE_CB
|
||||
else:
|
||||
open_cb = libarchive.read.OPEN_CALLBACK(ffi.VOID_CB)
|
||||
close_cb = libarchive.read.CLOSE_CALLBACK(ffi.VOID_CB)
|
||||
|
||||
with libarchive.read.new_archive_read(format_name, filter_name) as archive_p:
|
||||
read_set_seek_callback(archive_p, seek_cb)
|
||||
ffi.read_open(archive_p, None, open_cb, read_cb, close_cb)
|
||||
yield libarchive.read.ArchiveRead(archive_p)
|
||||
|
||||
|
||||
class LibArchiveFileSystem(AbstractArchiveFileSystem):
|
||||
"""Compressed archives as a file-system (read-only)
|
||||
|
||||
Supports the following formats:
|
||||
tar, pax, cpio, ISO9660, zip, mtree, shar, ar, raw, xar, lha/lzh, rar,
|
||||
Microsoft CAB, 7-Zip, WARC
|
||||
|
||||
See the libarchive documentation for further restrictions.
|
||||
https://www.libarchive.org/
|
||||
|
||||
Keeps file object open while instance lives. It only works with seekable
|
||||
file-like objects. In case the filesystem does not support this kind of
|
||||
file object, it is recommended to cache locally.
|
||||
|
||||
This class is pickleable, but not necessarily thread-safe (depends on the
|
||||
platform). See libarchive documentation for details.
|
||||
"""
|
||||
|
||||
root_marker = ""
|
||||
protocol = "libarchive"
|
||||
cachable = False
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
fo="",
|
||||
mode="r",
|
||||
target_protocol=None,
|
||||
target_options=None,
|
||||
block_size=DEFAULT_BLOCK_SIZE,
|
||||
**kwargs,
|
||||
):
|
||||
"""
|
||||
Parameters
|
||||
----------
|
||||
fo: str or file-like
|
||||
Contains the archive, and must exist. If a str, will fetch file using
|
||||
:meth:`~fsspec.open_files`, which must return one file exactly.
|
||||
mode: str
|
||||
Currently, only 'r' accepted
|
||||
target_protocol: str (optional)
|
||||
If ``fo`` is a string, this value can be used to override the
|
||||
FS protocol inferred from a URL
|
||||
target_options: dict (optional)
|
||||
Kwargs passed when instantiating the target FS, if ``fo`` is
|
||||
a string.
|
||||
"""
|
||||
super().__init__(self, **kwargs)
|
||||
if mode != "r":
|
||||
raise ValueError("Only read from archive files accepted")
|
||||
if isinstance(fo, str):
|
||||
files = open_files(fo, protocol=target_protocol, **(target_options or {}))
|
||||
if len(files) != 1:
|
||||
raise ValueError(
|
||||
f'Path "{fo}" did not resolve to exactly one file: "{files}"'
|
||||
)
|
||||
fo = files[0]
|
||||
self.of = fo
|
||||
self.fo = fo.__enter__() # the whole instance is a context
|
||||
self.block_size = block_size
|
||||
self.dir_cache = None
|
||||
|
||||
@contextmanager
|
||||
def _open_archive(self):
|
||||
self.fo.seek(0)
|
||||
with custom_reader(self.fo, block_size=self.block_size) as arc:
|
||||
yield arc
|
||||
|
||||
@classmethod
|
||||
def _strip_protocol(cls, path):
|
||||
# file paths are always relative to the archive root
|
||||
return super()._strip_protocol(path).lstrip("/")
|
||||
|
||||
def _get_dirs(self):
|
||||
fields = {
|
||||
"name": "pathname",
|
||||
"size": "size",
|
||||
"created": "ctime",
|
||||
"mode": "mode",
|
||||
"uid": "uid",
|
||||
"gid": "gid",
|
||||
"mtime": "mtime",
|
||||
}
|
||||
|
||||
if self.dir_cache is not None:
|
||||
return
|
||||
|
||||
self.dir_cache = {}
|
||||
list_names = []
|
||||
with self._open_archive() as arc:
|
||||
for entry in arc:
|
||||
if not entry.isdir and not entry.isfile:
|
||||
# Skip symbolic links, fifo entries, etc.
|
||||
continue
|
||||
self.dir_cache.update(
|
||||
{
|
||||
dirname: {"name": dirname, "size": 0, "type": "directory"}
|
||||
for dirname in self._all_dirnames(set(entry.name))
|
||||
}
|
||||
)
|
||||
f = {key: getattr(entry, fields[key]) for key in fields}
|
||||
f["type"] = "directory" if entry.isdir else "file"
|
||||
list_names.append(entry.name)
|
||||
|
||||
self.dir_cache[f["name"]] = f
|
||||
# libarchive does not seem to return an entry for the directories (at least
|
||||
# not in all formats), so get the directory names from the file names
|
||||
self.dir_cache.update(
|
||||
{
|
||||
dirname: {"name": dirname, "size": 0, "type": "directory"}
|
||||
for dirname in self._all_dirnames(list_names)
|
||||
}
|
||||
)
|
||||
|
||||
def _open(
|
||||
self,
|
||||
path,
|
||||
mode="rb",
|
||||
block_size=None,
|
||||
autocommit=True,
|
||||
cache_options=None,
|
||||
**kwargs,
|
||||
):
|
||||
path = self._strip_protocol(path)
|
||||
if mode != "rb":
|
||||
raise NotImplementedError
|
||||
|
||||
data = bytes()
|
||||
with self._open_archive() as arc:
|
||||
for entry in arc:
|
||||
if entry.pathname != path:
|
||||
continue
|
||||
|
||||
if entry.size == 0:
|
||||
# empty file, so there are no blocks
|
||||
break
|
||||
|
||||
for block in entry.get_blocks(entry.size):
|
||||
data = block
|
||||
break
|
||||
else:
|
||||
raise ValueError
|
||||
return MemoryFile(fs=self, path=path, data=data)
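
# Hedged usage sketch ("archive.tar" is a placeholder path; assumes the
# libarchive bindings imported above are installed):
def _example_list_archive(path="archive.tar"):
    fs = LibArchiveFileSystem(fo=path)
    return fs.ls("", detail=False)  # top-level members of the archive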
|
||||
@@ -0,0 +1,477 @@
|
||||
import datetime
|
||||
import io
|
||||
import logging
|
||||
import os
|
||||
import os.path as osp
|
||||
import shutil
|
||||
import stat
|
||||
import tempfile
|
||||
|
||||
from fsspec import AbstractFileSystem
|
||||
from fsspec.compression import compr
|
||||
from fsspec.core import get_compression
|
||||
from fsspec.utils import isfilelike, stringify_path
|
||||
|
||||
logger = logging.getLogger("fsspec.local")
|
||||
|
||||
|
||||
class LocalFileSystem(AbstractFileSystem):
|
||||
"""Interface to files on local storage
|
||||
|
||||
Parameters
|
||||
----------
|
||||
auto_mkdir: bool
|
||||
Whether, when opening a file, the directory containing it should
|
||||
be created (if it doesn't already exist). This is assumed by pyarrow
|
||||
code.
|
||||
"""
|
||||
|
||||
root_marker = "/"
|
||||
protocol = "file", "local"
|
||||
local_file = True
|
||||
|
||||
def __init__(self, auto_mkdir=False, **kwargs):
|
||||
super().__init__(**kwargs)
|
||||
self.auto_mkdir = auto_mkdir
|
||||
|
||||
@property
|
||||
def fsid(self):
|
||||
return "local"
|
||||
|
||||
def mkdir(self, path, create_parents=True, **kwargs):
|
||||
path = self._strip_protocol(path)
|
||||
if self.exists(path):
|
||||
raise FileExistsError(path)
|
||||
if create_parents:
|
||||
self.makedirs(path, exist_ok=True)
|
||||
else:
|
||||
os.mkdir(path, **kwargs)
|
||||
|
||||
def makedirs(self, path, exist_ok=False):
|
||||
path = self._strip_protocol(path)
|
||||
os.makedirs(path, exist_ok=exist_ok)
|
||||
|
||||
def rmdir(self, path):
|
||||
path = self._strip_protocol(path)
|
||||
os.rmdir(path)
|
||||
|
||||
def ls(self, path, detail=False, **kwargs):
|
||||
path = self._strip_protocol(path)
|
||||
path_info = self.info(path)
|
||||
infos = []
|
||||
if path_info["type"] == "directory":
|
||||
with os.scandir(path) as it:
|
||||
for f in it:
|
||||
try:
|
||||
# Only get the info if requested since it is a bit expensive (the stat call inside)
|
||||
# The strip_protocol is also used in info() and calls make_path_posix to always return posix paths
|
||||
info = self.info(f) if detail else self._strip_protocol(f.path)
|
||||
infos.append(info)
|
||||
except FileNotFoundError:
|
||||
pass
|
||||
else:
|
||||
infos = [path_info] if detail else [path_info["name"]]
|
||||
|
||||
return infos
|
||||
|
||||
def info(self, path, **kwargs):
|
||||
if isinstance(path, os.DirEntry):
|
||||
# scandir DirEntry
|
||||
out = path.stat(follow_symlinks=False)
|
||||
link = path.is_symlink()
|
||||
if path.is_dir(follow_symlinks=False):
|
||||
t = "directory"
|
||||
elif path.is_file(follow_symlinks=False):
|
||||
t = "file"
|
||||
else:
|
||||
t = "other"
|
||||
|
||||
size = out.st_size
|
||||
if link:
|
||||
try:
|
||||
out2 = path.stat(follow_symlinks=True)
|
||||
size = out2.st_size
|
||||
except OSError:
|
||||
size = 0
|
||||
path = self._strip_protocol(path.path)
|
||||
else:
|
||||
# str or path-like
|
||||
path = self._strip_protocol(path)
|
||||
out = os.stat(path, follow_symlinks=False)
|
||||
link = stat.S_ISLNK(out.st_mode)
|
||||
if link:
|
||||
out = os.stat(path, follow_symlinks=True)
|
||||
size = out.st_size
|
||||
if stat.S_ISDIR(out.st_mode):
|
||||
t = "directory"
|
||||
elif stat.S_ISREG(out.st_mode):
|
||||
t = "file"
|
||||
else:
|
||||
t = "other"
|
||||
result = {
|
||||
"name": path,
|
||||
"size": size,
|
||||
"type": t,
|
||||
"created": out.st_ctime,
|
||||
"islink": link,
|
||||
}
|
||||
for field in ["mode", "uid", "gid", "mtime", "ino", "nlink"]:
|
||||
result[field] = getattr(out, f"st_{field}")
|
||||
if link:
|
||||
result["destination"] = os.readlink(path)
|
||||
return result
|
||||
|
||||
def lexists(self, path, **kwargs):
|
||||
return osp.lexists(path)
|
||||
|
||||
def cp_file(self, path1, path2, **kwargs):
|
||||
path1 = self._strip_protocol(path1)
|
||||
path2 = self._strip_protocol(path2)
|
||||
if self.auto_mkdir:
|
||||
self.makedirs(self._parent(path2), exist_ok=True)
|
||||
if self.isfile(path1):
|
||||
shutil.copyfile(path1, path2)
|
||||
elif self.isdir(path1):
|
||||
self.mkdirs(path2, exist_ok=True)
|
||||
else:
|
||||
raise FileNotFoundError(path1)
|
||||
|
||||
def isfile(self, path):
|
||||
path = self._strip_protocol(path)
|
||||
return os.path.isfile(path)
|
||||
|
||||
def isdir(self, path):
|
||||
path = self._strip_protocol(path)
|
||||
return os.path.isdir(path)
|
||||
|
||||
def get_file(self, path1, path2, callback=None, **kwargs):
|
||||
if isfilelike(path2):
|
||||
with open(path1, "rb") as f:
|
||||
shutil.copyfileobj(f, path2)
|
||||
else:
|
||||
return self.cp_file(path1, path2, **kwargs)
|
||||
|
||||
def put_file(self, path1, path2, callback=None, **kwargs):
|
||||
return self.cp_file(path1, path2, **kwargs)
|
||||
|
||||
def mv(self, path1, path2, **kwargs):
|
||||
path1 = self._strip_protocol(path1)
|
||||
path2 = self._strip_protocol(path2)
|
||||
shutil.move(path1, path2)
|
||||
|
||||
def link(self, src, dst, **kwargs):
|
||||
src = self._strip_protocol(src)
|
||||
dst = self._strip_protocol(dst)
|
||||
os.link(src, dst, **kwargs)
|
||||
|
||||
def symlink(self, src, dst, **kwargs):
|
||||
src = self._strip_protocol(src)
|
||||
dst = self._strip_protocol(dst)
|
||||
os.symlink(src, dst, **kwargs)
|
||||
|
||||
def islink(self, path) -> bool:
|
||||
return os.path.islink(self._strip_protocol(path))
|
||||
|
||||
def rm_file(self, path):
|
||||
os.remove(self._strip_protocol(path))
|
||||
|
||||
def rm(self, path, recursive=False, maxdepth=None):
|
||||
if not isinstance(path, list):
|
||||
path = [path]
|
||||
|
||||
for p in path:
|
||||
p = self._strip_protocol(p)
|
||||
if self.isdir(p):
|
||||
if not recursive:
|
||||
raise ValueError("Cannot delete directory, set recursive=True")
|
||||
if osp.abspath(p) == os.getcwd():
|
||||
raise ValueError("Cannot delete current working directory")
|
||||
shutil.rmtree(p)
|
||||
else:
|
||||
os.remove(p)
|
||||
|
||||
def unstrip_protocol(self, name):
|
||||
name = self._strip_protocol(name) # normalise for local/win/...
|
||||
return f"file://{name}"
|
||||
|
||||
def _open(self, path, mode="rb", block_size=None, **kwargs):
|
||||
path = self._strip_protocol(path)
|
||||
if self.auto_mkdir and "w" in mode:
|
||||
self.makedirs(self._parent(path), exist_ok=True)
|
||||
return LocalFileOpener(path, mode, fs=self, **kwargs)
|
||||
|
||||
def touch(self, path, truncate=True, **kwargs):
|
||||
path = self._strip_protocol(path)
|
||||
if self.auto_mkdir:
|
||||
self.makedirs(self._parent(path), exist_ok=True)
|
||||
if self.exists(path):
|
||||
os.utime(path, None)
|
||||
else:
|
||||
open(path, "a").close()
|
||||
if truncate:
|
||||
os.truncate(path, 0)
|
||||
|
||||
def created(self, path):
|
||||
info = self.info(path=path)
|
||||
return datetime.datetime.fromtimestamp(
|
||||
info["created"], tz=datetime.timezone.utc
|
||||
)
|
||||
|
||||
def modified(self, path):
|
||||
info = self.info(path=path)
|
||||
return datetime.datetime.fromtimestamp(info["mtime"], tz=datetime.timezone.utc)
|
||||
|
||||
@classmethod
|
||||
def _parent(cls, path):
|
||||
path = cls._strip_protocol(path)
|
||||
if os.sep == "/":
|
||||
# posix native
|
||||
return path.rsplit("/", 1)[0] or "/"
|
||||
else:
|
||||
# NT
|
||||
path_ = path.rsplit("/", 1)[0]
|
||||
if len(path_) <= 3:
|
||||
if path_[1:2] == ":":
|
||||
# nt root (something like c:/)
|
||||
return path_[0] + ":/"
|
||||
# More cases may be required here
|
||||
return path_
|
||||
|
||||
@classmethod
|
||||
def _strip_protocol(cls, path):
|
||||
path = stringify_path(path)
|
||||
if path.startswith("file://"):
|
||||
path = path[7:]
|
||||
elif path.startswith("file:"):
|
||||
path = path[5:]
|
||||
elif path.startswith("local://"):
|
||||
path = path[8:]
|
||||
elif path.startswith("local:"):
|
||||
path = path[6:]
|
||||
|
||||
path = make_path_posix(path)
|
||||
if os.sep != "/":
|
||||
# This code-path is a stripped down version of
|
||||
# > drive, path = ntpath.splitdrive(path)
|
||||
if path[1:2] == ":":
|
||||
# Absolute drive-letter path, e.g. X:\Windows
|
||||
# Relative path with drive, e.g. X:Windows
|
||||
drive, path = path[:2], path[2:]
|
||||
elif path[:2] == "//":
|
||||
# UNC drives, e.g. \\server\share or \\?\UNC\server\share
|
||||
# Device drives, e.g. \\.\device or \\?\device
|
||||
if (index1 := path.find("/", 2)) == -1 or (
|
||||
index2 := path.find("/", index1 + 1)
|
||||
) == -1:
|
||||
drive, path = path, ""
|
||||
else:
|
||||
drive, path = path[:index2], path[index2:]
|
||||
else:
|
||||
# Relative path, e.g. Windows
|
||||
drive = ""
|
||||
|
||||
path = path.rstrip("/") or cls.root_marker
|
||||
return drive + path
|
||||
|
||||
else:
|
||||
return path.rstrip("/") or cls.root_marker
|
||||
|
||||
def _isfilestore(self):
|
||||
# Inheriting from DaskFileSystem makes this False (S3, etc. were
|
||||
# the original motivation). But we are a posix-like file system.
|
||||
# See https://github.com/dask/dask/issues/5526
|
||||
return True
|
||||
|
||||
def chmod(self, path, mode):
|
||||
path = stringify_path(path)
|
||||
return os.chmod(path, mode)
|
||||
|
||||
|
||||
def make_path_posix(path):
|
||||
"""Make path generic and absolute for current OS"""
|
||||
if not isinstance(path, str):
|
||||
if isinstance(path, (list, set, tuple)):
|
||||
return type(path)(make_path_posix(p) for p in path)
|
||||
else:
|
||||
path = stringify_path(path)
|
||||
if not isinstance(path, str):
|
||||
raise TypeError(f"could not convert {path!r} to string")
|
||||
if os.sep == "/":
|
||||
# Native posix
|
||||
if path.startswith("/"):
|
||||
# most common fast case for posix
|
||||
return path
|
||||
elif path.startswith("~"):
|
||||
return osp.expanduser(path)
|
||||
elif path.startswith("./"):
|
||||
path = path[2:]
|
||||
elif path == ".":
|
||||
path = ""
|
||||
return f"{os.getcwd()}/{path}"
|
||||
else:
|
||||
# NT handling
|
||||
if path[0:1] == "/" and path[2:3] == ":":
|
||||
# path is like "/c:/local/path"
|
||||
path = path[1:]
|
||||
if path[1:2] == ":":
|
||||
# windows full path like "C:\\local\\path"
|
||||
if len(path) <= 3:
|
||||
# nt root (something like c:/)
|
||||
return path[0] + ":/"
|
||||
path = path.replace("\\", "/")
|
||||
return path
|
||||
elif path[0:1] == "~":
|
||||
return make_path_posix(osp.expanduser(path))
|
||||
elif path.startswith(("\\\\", "//")):
|
||||
# windows UNC/DFS-style paths
|
||||
return "//" + path[2:].replace("\\", "/")
|
||||
elif path.startswith(("\\", "/")):
|
||||
# windows relative path with root
|
||||
path = path.replace("\\", "/")
|
||||
return f"{osp.splitdrive(os.getcwd())[0]}{path}"
|
||||
else:
|
||||
path = path.replace("\\", "/")
|
||||
if path.startswith("./"):
|
||||
path = path[2:]
|
||||
elif path == ".":
|
||||
path = ""
|
||||
return f"{make_path_posix(os.getcwd())}/{path}"
|
||||
|
||||
|
||||
def trailing_sep(path):
|
||||
"""Return True if the path ends with a path separator.
|
||||
|
||||
A forward slash is always considered a path separator, even on Operating
|
||||
Systems that normally use a backslash.
|
||||
"""
|
||||
# TODO: if all incoming paths were posix-compliant then separator would
|
||||
# always be a forward slash, simplifying this function.
|
||||
# See https://github.com/fsspec/filesystem_spec/pull/1250
|
||||
return path.endswith(os.sep) or (os.altsep is not None and path.endswith(os.altsep))
|
||||
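# Editor's illustrative sketch (not part of the upstream module): what the two
# helpers above do on a POSIX host. The literal paths are placeholders.
def _path_helper_examples():
    import os

    assert make_path_posix("/var/log") == "/var/log"           # already absolute
    assert make_path_posix("./notes.txt") == f"{os.getcwd()}/notes.txt"
    assert trailing_sep("data/")                                # "/" always counts
    assert not trailing_sep("data")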
|
||||
|
||||
class LocalFileOpener(io.IOBase):
|
||||
def __init__(
|
||||
self, path, mode, autocommit=True, fs=None, compression=None, **kwargs
|
||||
):
|
||||
logger.debug("open file: %s", path)
|
||||
self.path = path
|
||||
self.mode = mode
|
||||
self.fs = fs
|
||||
self.f = None
|
||||
self.autocommit = autocommit
|
||||
self.compression = get_compression(path, compression)
|
||||
self.blocksize = io.DEFAULT_BUFFER_SIZE
|
||||
self._open()
|
||||
|
||||
def _open(self):
|
||||
if self.f is None or self.f.closed:
|
||||
if self.autocommit or "w" not in self.mode:
|
||||
self.f = open(self.path, mode=self.mode)
|
||||
if self.compression:
|
||||
compress = compr[self.compression]
|
||||
self.f = compress(self.f, mode=self.mode)
|
||||
else:
|
||||
# TODO: check if path is writable?
|
||||
i, name = tempfile.mkstemp()
|
||||
os.close(i) # we want normal open and normal buffered file
|
||||
self.temp = name
|
||||
self.f = open(name, mode=self.mode)
|
||||
if "w" not in self.mode:
|
||||
self.size = self.f.seek(0, 2)
|
||||
self.f.seek(0)
|
||||
self.f.size = self.size
|
||||
|
||||
def _fetch_range(self, start, end):
|
||||
# probably only used by cached FS
|
||||
if "r" not in self.mode:
|
||||
raise ValueError
|
||||
self._open()
|
||||
self.f.seek(start)
|
||||
return self.f.read(end - start)
|
||||
|
||||
def __setstate__(self, state):
|
||||
self.f = None
|
||||
loc = state.pop("loc", None)
|
||||
self.__dict__.update(state)
|
||||
if "r" in state["mode"]:
|
||||
self.f = None
|
||||
self._open()
|
||||
self.f.seek(loc)
|
||||
|
||||
def __getstate__(self):
|
||||
d = self.__dict__.copy()
|
||||
d.pop("f")
|
||||
if "r" in self.mode:
|
||||
d["loc"] = self.f.tell()
|
||||
else:
|
||||
if not self.f.closed:
|
||||
raise ValueError("Cannot serialise open write-mode local file")
|
||||
return d
|
||||
|
||||
def commit(self):
|
||||
if self.autocommit:
|
||||
raise RuntimeError("Can only commit if not already set to autocommit")
|
||||
shutil.move(self.temp, self.path)
|
||||
|
||||
def discard(self):
|
||||
if self.autocommit:
|
||||
raise RuntimeError("Cannot discard if set to autocommit")
|
||||
os.remove(self.temp)
|
||||
|
||||
def readable(self) -> bool:
|
||||
return True
|
||||
|
||||
def writable(self) -> bool:
|
||||
return "r" not in self.mode
|
||||
|
||||
def read(self, *args, **kwargs):
|
||||
return self.f.read(*args, **kwargs)
|
||||
|
||||
def write(self, *args, **kwargs):
|
||||
return self.f.write(*args, **kwargs)
|
||||
|
||||
def tell(self, *args, **kwargs):
|
||||
return self.f.tell(*args, **kwargs)
|
||||
|
||||
def seek(self, *args, **kwargs):
|
||||
return self.f.seek(*args, **kwargs)
|
||||
|
||||
def seekable(self, *args, **kwargs):
|
||||
return self.f.seekable(*args, **kwargs)
|
||||
|
||||
def readline(self, *args, **kwargs):
|
||||
return self.f.readline(*args, **kwargs)
|
||||
|
||||
def readlines(self, *args, **kwargs):
|
||||
return self.f.readlines(*args, **kwargs)
|
||||
|
||||
def close(self):
|
||||
return self.f.close()
|
||||
|
||||
def truncate(self, size=None) -> int:
|
||||
return self.f.truncate(size)
|
||||
|
||||
@property
|
||||
def closed(self):
|
||||
return self.f.closed
|
||||
|
||||
def fileno(self):
|
||||
return self.raw.fileno()
|
||||
|
||||
def flush(self) -> None:
|
||||
self.f.flush()
|
||||
|
||||
def __iter__(self):
|
||||
return self.f.__iter__()
|
||||
|
||||
def __getattr__(self, item):
|
||||
return getattr(self.f, item)
|
||||
|
||||
def __enter__(self):
|
||||
self._incontext = True
|
||||
return self
|
||||
|
||||
def __exit__(self, exc_type, exc_value, traceback):
|
||||
self._incontext = False
|
||||
self.f.__exit__(exc_type, exc_value, traceback)
|
||||
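# Editor's illustrative sketch (not part of the upstream module): LocalFileOpener
# is normally obtained through the filesystem rather than constructed directly.
# Assumes a POSIX host; the temporary path is a placeholder.
def _local_roundtrip_example(tmp_file="/tmp/fsspec_local_demo.txt"):
    import fsspec

    fs = fsspec.filesystem("file")
    with fs.open(tmp_file, "wb") as f:   # returns a LocalFileOpener
        f.write(b"hello")
    with fs.open(tmp_file, "rb") as f:
        data = f.read()
    fs.rm(tmp_file)
    return data  # b"hello"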
@@ -0,0 +1,312 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from datetime import datetime, timezone
|
||||
from errno import ENOTEMPTY
|
||||
from io import BytesIO
|
||||
from pathlib import PurePath, PureWindowsPath
|
||||
from typing import Any, ClassVar
|
||||
|
||||
from fsspec import AbstractFileSystem
|
||||
from fsspec.implementations.local import LocalFileSystem
|
||||
from fsspec.utils import stringify_path
|
||||
|
||||
logger = logging.getLogger("fsspec.memoryfs")
|
||||
|
||||
|
||||
class MemoryFileSystem(AbstractFileSystem):
|
||||
"""A filesystem based on a dict of BytesIO objects
|
||||
|
||||
This is a global filesystem so instances of this class all point to the same
|
||||
in memory filesystem.
|
||||
"""
|
||||
|
||||
store: ClassVar[dict[str, Any]] = {} # global, do not overwrite!
|
||||
pseudo_dirs = [""] # global, do not overwrite!
|
||||
protocol = "memory"
|
||||
root_marker = "/"
|
||||
|
||||
@classmethod
|
||||
def _strip_protocol(cls, path):
|
||||
if isinstance(path, PurePath):
|
||||
if isinstance(path, PureWindowsPath):
|
||||
return LocalFileSystem._strip_protocol(path)
|
||||
else:
|
||||
path = stringify_path(path)
|
||||
|
||||
if path.startswith("memory://"):
|
||||
path = path[len("memory://") :]
|
||||
if "::" in path or "://" in path:
|
||||
return path.rstrip("/")
|
||||
path = path.lstrip("/").rstrip("/")
|
||||
return "/" + path if path else ""
|
||||
|
||||
def ls(self, path, detail=True, **kwargs):
|
||||
path = self._strip_protocol(path)
|
||||
if path in self.store:
|
||||
# there is a key with this exact name
|
||||
if not detail:
|
||||
return [path]
|
||||
return [
|
||||
{
|
||||
"name": path,
|
||||
"size": self.store[path].size,
|
||||
"type": "file",
|
||||
"created": self.store[path].created.timestamp(),
|
||||
}
|
||||
]
|
||||
paths = set()
|
||||
starter = path + "/"
|
||||
out = []
|
||||
for p2 in tuple(self.store):
|
||||
if p2.startswith(starter):
|
||||
if "/" not in p2[len(starter) :]:
|
||||
# exact child
|
||||
out.append(
|
||||
{
|
||||
"name": p2,
|
||||
"size": self.store[p2].size,
|
||||
"type": "file",
|
||||
"created": self.store[p2].created.timestamp(),
|
||||
}
|
||||
)
|
||||
elif len(p2) > len(starter):
|
||||
# implied child directory
|
||||
ppath = starter + p2[len(starter) :].split("/", 1)[0]
|
||||
if ppath not in paths:
|
||||
out = out or []
|
||||
out.append(
|
||||
{
|
||||
"name": ppath,
|
||||
"size": 0,
|
||||
"type": "directory",
|
||||
}
|
||||
)
|
||||
paths.add(ppath)
|
||||
for p2 in self.pseudo_dirs:
|
||||
if p2.startswith(starter):
|
||||
if "/" not in p2[len(starter) :]:
|
||||
# exact child pdir
|
||||
if p2 not in paths:
|
||||
out.append({"name": p2, "size": 0, "type": "directory"})
|
||||
paths.add(p2)
|
||||
else:
|
||||
# directory implied by deeper pdir
|
||||
ppath = starter + p2[len(starter) :].split("/", 1)[0]
|
||||
if ppath not in paths:
|
||||
out.append({"name": ppath, "size": 0, "type": "directory"})
|
||||
paths.add(ppath)
|
||||
if not out:
|
||||
if path in self.pseudo_dirs:
|
||||
# empty dir
|
||||
return []
|
||||
raise FileNotFoundError(path)
|
||||
if detail:
|
||||
return out
|
||||
return sorted([f["name"] for f in out])
|
||||
|
||||
def mkdir(self, path, create_parents=True, **kwargs):
|
||||
path = self._strip_protocol(path)
|
||||
if path in self.store or path in self.pseudo_dirs:
|
||||
raise FileExistsError(path)
|
||||
if self._parent(path).strip("/") and self.isfile(self._parent(path)):
|
||||
raise NotADirectoryError(self._parent(path))
|
||||
if create_parents and self._parent(path).strip("/"):
|
||||
try:
|
||||
self.mkdir(self._parent(path), create_parents, **kwargs)
|
||||
except FileExistsError:
|
||||
pass
|
||||
if path and path not in self.pseudo_dirs:
|
||||
self.pseudo_dirs.append(path)
|
||||
|
||||
def makedirs(self, path, exist_ok=False):
|
||||
try:
|
||||
self.mkdir(path, create_parents=True)
|
||||
except FileExistsError:
|
||||
if not exist_ok:
|
||||
raise
|
||||
|
||||
def pipe_file(self, path, value, mode="overwrite", **kwargs):
|
||||
"""Set the bytes of given file
|
||||
|
||||
Avoids copies of the data if possible
|
||||
"""
|
||||
mode = "xb" if mode == "create" else "wb"
|
||||
self.open(path, mode=mode, data=value)
|
||||
|
||||
def rmdir(self, path):
|
||||
path = self._strip_protocol(path)
|
||||
if path == "":
|
||||
# silently avoid deleting FS root
|
||||
return
|
||||
if path in self.pseudo_dirs:
|
||||
if not self.ls(path):
|
||||
self.pseudo_dirs.remove(path)
|
||||
else:
|
||||
raise OSError(ENOTEMPTY, "Directory not empty", path)
|
||||
else:
|
||||
raise FileNotFoundError(path)
|
||||
|
||||
def info(self, path, **kwargs):
|
||||
logger.debug("info: %s", path)
|
||||
path = self._strip_protocol(path)
|
||||
if path in self.pseudo_dirs or any(
|
||||
p.startswith(path + "/") for p in list(self.store) + self.pseudo_dirs
|
||||
):
|
||||
return {
|
||||
"name": path,
|
||||
"size": 0,
|
||||
"type": "directory",
|
||||
}
|
||||
elif path in self.store:
|
||||
filelike = self.store[path]
|
||||
return {
|
||||
"name": path,
|
||||
"size": filelike.size,
|
||||
"type": "file",
|
||||
"created": getattr(filelike, "created", None),
|
||||
}
|
||||
else:
|
||||
raise FileNotFoundError(path)
|
||||
|
||||
def _open(
|
||||
self,
|
||||
path,
|
||||
mode="rb",
|
||||
block_size=None,
|
||||
autocommit=True,
|
||||
cache_options=None,
|
||||
**kwargs,
|
||||
):
|
||||
path = self._strip_protocol(path)
|
||||
if "x" in mode and self.exists(path):
|
||||
raise FileExistsError
|
||||
if path in self.pseudo_dirs:
|
||||
raise IsADirectoryError(path)
|
||||
parent = path
|
||||
while len(parent) > 1:
|
||||
parent = self._parent(parent)
|
||||
if self.isfile(parent):
|
||||
raise FileExistsError(parent)
|
||||
if mode in ["rb", "ab", "r+b"]:
|
||||
if path in self.store:
|
||||
f = self.store[path]
|
||||
if mode == "ab":
|
||||
# position at the end of file
|
||||
f.seek(0, 2)
|
||||
else:
|
||||
# position at the beginning of file
|
||||
f.seek(0)
|
||||
return f
|
||||
else:
|
||||
raise FileNotFoundError(path)
|
||||
elif mode in {"wb", "xb"}:
|
||||
if mode == "xb" and self.exists(path):
|
||||
raise FileExistsError
|
||||
m = MemoryFile(self, path, kwargs.get("data"))
|
||||
if not self._intrans:
|
||||
m.commit()
|
||||
return m
|
||||
else:
|
||||
name = self.__class__.__name__
|
||||
raise ValueError(f"unsupported file mode for {name}: {mode!r}")
|
||||
|
||||
def cp_file(self, path1, path2, **kwargs):
|
||||
path1 = self._strip_protocol(path1)
|
||||
path2 = self._strip_protocol(path2)
|
||||
if self.isfile(path1):
|
||||
self.store[path2] = MemoryFile(
|
||||
self, path2, self.store[path1].getvalue()
|
||||
) # implicit copy
|
||||
elif self.isdir(path1):
|
||||
if path2 not in self.pseudo_dirs:
|
||||
self.pseudo_dirs.append(path2)
|
||||
else:
|
||||
raise FileNotFoundError(path1)
|
||||
|
||||
def cat_file(self, path, start=None, end=None, **kwargs):
|
||||
logger.debug("cat: %s", path)
|
||||
path = self._strip_protocol(path)
|
||||
try:
|
||||
return bytes(self.store[path].getbuffer()[start:end])
|
||||
except KeyError as e:
|
||||
raise FileNotFoundError(path) from e
|
||||
|
||||
def _rm(self, path):
|
||||
path = self._strip_protocol(path)
|
||||
try:
|
||||
del self.store[path]
|
||||
except KeyError as e:
|
||||
raise FileNotFoundError(path) from e
|
||||
|
||||
def modified(self, path):
|
||||
path = self._strip_protocol(path)
|
||||
try:
|
||||
return self.store[path].modified
|
||||
except KeyError as e:
|
||||
raise FileNotFoundError(path) from e
|
||||
|
||||
def created(self, path):
|
||||
path = self._strip_protocol(path)
|
||||
try:
|
||||
return self.store[path].created
|
||||
except KeyError as e:
|
||||
raise FileNotFoundError(path) from e
|
||||
|
||||
def isfile(self, path):
|
||||
path = self._strip_protocol(path)
|
||||
return path in self.store
|
||||
|
||||
def rm(self, path, recursive=False, maxdepth=None):
|
||||
if isinstance(path, str):
|
||||
path = self._strip_protocol(path)
|
||||
else:
|
||||
path = [self._strip_protocol(p) for p in path]
|
||||
paths = self.expand_path(path, recursive=recursive, maxdepth=maxdepth)
|
||||
for p in reversed(paths):
|
||||
if self.isfile(p):
|
||||
self.rm_file(p)
|
||||
# If the expanded path doesn't exist, it is only because the expanded
|
||||
# path was a directory that does not exist in self.pseudo_dirs. This
|
||||
# is possible if you directly create files without making the
|
||||
# directories first.
|
||||
elif not self.exists(p):
|
||||
continue
|
||||
else:
|
||||
self.rmdir(p)
|
||||
|
||||
|
||||
class MemoryFile(BytesIO):
|
||||
"""A BytesIO which can't close and works as a context manager
|
||||
|
||||
Can initialise with data. Each path should only be active once at any moment.
|
||||
|
||||
No need to provide fs, path if auto-committing (default)
|
||||
"""
|
||||
|
||||
def __init__(self, fs=None, path=None, data=None):
|
||||
logger.debug("open file %s", path)
|
||||
self.fs = fs
|
||||
self.path = path
|
||||
self.created = datetime.now(tz=timezone.utc)
|
||||
self.modified = datetime.now(tz=timezone.utc)
|
||||
if data:
|
||||
super().__init__(data)
|
||||
self.seek(0)
|
||||
|
||||
@property
|
||||
def size(self):
|
||||
return self.getbuffer().nbytes
|
||||
|
||||
def __enter__(self):
|
||||
return self
|
||||
|
||||
def close(self):
|
||||
pass
|
||||
|
||||
def discard(self):
|
||||
pass
|
||||
|
||||
def commit(self):
|
||||
self.fs.store[self.path] = self
|
||||
self.modified = datetime.now(tz=timezone.utc)
|
||||
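# Editor's illustrative sketch (not part of the upstream module): because the
# store is a class attribute, every MemoryFileSystem instance shares one tree.
def _memory_example():
    import fsspec

    fs = fsspec.filesystem("memory")
    fs.pipe_file("/demo/a.txt", b"payload")           # bytes go straight into the store
    assert fs.cat_file("/demo/a.txt") == b"payload"
    assert fs.ls("/demo", detail=False) == ["/demo/a.txt"]
    fs.rm("/demo", recursive=True)                    # clean up the shared store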
File diff suppressed because it is too large
@@ -0,0 +1,180 @@
|
||||
import datetime
|
||||
import logging
|
||||
import os
|
||||
import types
|
||||
import uuid
|
||||
from stat import S_ISDIR, S_ISLNK
|
||||
|
||||
import paramiko
|
||||
|
||||
from .. import AbstractFileSystem
|
||||
from ..utils import infer_storage_options
|
||||
|
||||
logger = logging.getLogger("fsspec.sftp")
|
||||
|
||||
|
||||
class SFTPFileSystem(AbstractFileSystem):
|
||||
"""Files over SFTP/SSH
|
||||
|
||||
Peer-to-peer filesystem over SSH using paramiko.
|
||||
|
||||
Note: if using this with ``open`` or ``open_files`` and full URLs,
|
||||
there is no way to tell if a path is relative, so all paths are assumed
|
||||
to be absolute.
|
||||
"""
|
||||
|
||||
protocol = "sftp", "ssh"
|
||||
|
||||
def __init__(self, host, **ssh_kwargs):
|
||||
"""
|
||||
|
||||
Parameters
|
||||
----------
|
||||
host: str
|
||||
Hostname or IP as a string
|
||||
temppath: str
|
||||
Location on the server to put files, when within a transaction
|
||||
ssh_kwargs: dict
|
||||
Parameters passed on to connection. See details in
|
||||
https://docs.paramiko.org/en/3.3/api/client.html#paramiko.client.SSHClient.connect
|
||||
May include port, username, password...
|
||||
"""
|
||||
if self._cached:
|
||||
return
|
||||
super().__init__(**ssh_kwargs)
|
||||
self.temppath = ssh_kwargs.pop("temppath", "/tmp") # remote temp directory
|
||||
self.host = host
|
||||
self.ssh_kwargs = ssh_kwargs
|
||||
self._connect()
|
||||
|
||||
def _connect(self):
|
||||
logger.debug("Connecting to SFTP server %s", self.host)
|
||||
self.client = paramiko.SSHClient()
|
||||
self.client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
|
||||
self.client.connect(self.host, **self.ssh_kwargs)
|
||||
self.ftp = self.client.open_sftp()
|
||||
|
||||
@classmethod
|
||||
def _strip_protocol(cls, path):
|
||||
return infer_storage_options(path)["path"]
|
||||
|
||||
@staticmethod
|
||||
def _get_kwargs_from_urls(urlpath):
|
||||
out = infer_storage_options(urlpath)
|
||||
out.pop("path", None)
|
||||
out.pop("protocol", None)
|
||||
return out
|
||||
|
||||
def mkdir(self, path, create_parents=True, mode=511):
|
||||
logger.debug("Creating folder %s", path)
|
||||
if self.exists(path):
|
||||
raise FileExistsError(f"File exists: {path}")
|
||||
|
||||
if create_parents:
|
||||
self.makedirs(path)
|
||||
else:
|
||||
self.ftp.mkdir(path, mode)
|
||||
|
||||
def makedirs(self, path, exist_ok=False, mode=511):
|
||||
if self.exists(path) and not exist_ok:
|
||||
raise FileExistsError(f"File exists: {path}")
|
||||
|
||||
parts = path.split("/")
|
||||
new_path = "/" if path[:1] == "/" else ""
|
||||
|
||||
for part in parts:
|
||||
if part:
|
||||
new_path = f"{new_path}/{part}" if new_path else part
|
||||
if not self.exists(new_path):
|
||||
self.ftp.mkdir(new_path, mode)
|
||||
|
||||
def rmdir(self, path):
|
||||
logger.debug("Removing folder %s", path)
|
||||
self.ftp.rmdir(path)
|
||||
|
||||
def info(self, path):
|
||||
stat = self._decode_stat(self.ftp.stat(path))
|
||||
stat["name"] = path
|
||||
return stat
|
||||
|
||||
@staticmethod
|
||||
def _decode_stat(stat, parent_path=None):
|
||||
if S_ISDIR(stat.st_mode):
|
||||
t = "directory"
|
||||
elif S_ISLNK(stat.st_mode):
|
||||
t = "link"
|
||||
else:
|
||||
t = "file"
|
||||
out = {
|
||||
"name": "",
|
||||
"size": stat.st_size,
|
||||
"type": t,
|
||||
"uid": stat.st_uid,
|
||||
"gid": stat.st_gid,
|
||||
"time": datetime.datetime.fromtimestamp(
|
||||
stat.st_atime, tz=datetime.timezone.utc
|
||||
),
|
||||
"mtime": datetime.datetime.fromtimestamp(
|
||||
stat.st_mtime, tz=datetime.timezone.utc
|
||||
),
|
||||
}
|
||||
if parent_path:
|
||||
out["name"] = "/".join([parent_path.rstrip("/"), stat.filename])
|
||||
return out
|
||||
|
||||
def ls(self, path, detail=False):
|
||||
logger.debug("Listing folder %s", path)
|
||||
stats = [self._decode_stat(stat, path) for stat in self.ftp.listdir_iter(path)]
|
||||
if detail:
|
||||
return stats
|
||||
else:
|
||||
paths = [stat["name"] for stat in stats]
|
||||
return sorted(paths)
|
||||
|
||||
def put(self, lpath, rpath, callback=None, **kwargs):
|
||||
logger.debug("Put file %s into %s", lpath, rpath)
|
||||
self.ftp.put(lpath, rpath)
|
||||
|
||||
def get_file(self, rpath, lpath, **kwargs):
|
||||
if self.isdir(rpath):
|
||||
os.makedirs(lpath, exist_ok=True)
|
||||
else:
|
||||
self.ftp.get(self._strip_protocol(rpath), lpath)
|
||||
|
||||
def _open(self, path, mode="rb", block_size=None, **kwargs):
|
||||
"""
|
||||
block_size: int or None
|
||||
If 0, no buffering; if 1, line buffering; if >1, buffer that many
|
||||
bytes; if None, use the default from paramiko.
|
||||
"""
|
||||
logger.debug("Opening file %s", path)
|
||||
if kwargs.get("autocommit", True) is False:
|
||||
# writes to temporary file, move on commit
|
||||
path2 = "/".join([self.temppath, str(uuid.uuid4())])
|
||||
f = self.ftp.open(path2, mode, bufsize=block_size if block_size else -1)
|
||||
f.temppath = path2
|
||||
f.targetpath = path
|
||||
f.fs = self
|
||||
f.commit = types.MethodType(commit_a_file, f)
|
||||
f.discard = types.MethodType(discard_a_file, f)
|
||||
else:
|
||||
f = self.ftp.open(path, mode, bufsize=block_size if block_size else -1)
|
||||
return f
|
||||
|
||||
def _rm(self, path):
|
||||
if self.isdir(path):
|
||||
self.ftp.rmdir(path)
|
||||
else:
|
||||
self.ftp.remove(path)
|
||||
|
||||
def mv(self, old, new):
|
||||
logger.debug("Renaming %s into %s", old, new)
|
||||
self.ftp.posix_rename(old, new)
|
||||
|
||||
|
||||
def commit_a_file(self):
|
||||
self.fs.mv(self.temppath, self.targetpath)
|
||||
|
||||
|
||||
def discard_a_file(self):
|
||||
self.fs._rm(self.temppath)
|
||||
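# Editor's illustrative sketch (not part of the upstream module): typical use of
# the SFTP implementation. Host, username and key path are placeholders; the
# extra kwargs are simply forwarded to paramiko's SSHClient.connect().
def _sftp_example():
    import fsspec

    fs = fsspec.filesystem(
        "sftp",
        host="sftp.example.com",                 # hypothetical server
        username="demo",                         # hypothetical credentials
        key_filename="/home/demo/.ssh/id_rsa",   # forwarded to paramiko
    )
    return fs.ls("/tmp", detail=False)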
416
.venv/lib/python3.10/site-packages/fsspec/implementations/smb.py
Normal file
@@ -0,0 +1,416 @@
|
||||
"""
|
||||
This module contains SMBFileSystem class responsible for handling access to
|
||||
Windows Samba network shares by using package smbprotocol
|
||||
"""
|
||||
|
||||
import datetime
|
||||
import re
|
||||
import uuid
|
||||
from stat import S_ISDIR, S_ISLNK
|
||||
|
||||
import smbclient
|
||||
import smbprotocol.exceptions
|
||||
|
||||
from .. import AbstractFileSystem
|
||||
from ..utils import infer_storage_options
|
||||
|
||||
# ! pylint: disable=bad-continuation
|
||||
|
||||
|
||||
class SMBFileSystem(AbstractFileSystem):
|
||||
"""Allow reading and writing to Windows and Samba network shares.
|
||||
|
||||
When using `fsspec.open()` for getting a file-like object the URI
|
||||
should be specified as this format:
|
||||
``smb://workgroup;user:password@server:port/share/folder/file.csv``.
|
||||
|
||||
Example::
|
||||
|
||||
>>> import fsspec
|
||||
>>> with fsspec.open(
|
||||
... 'smb://myuser:mypassword@myserver.com/share/folder/file.csv'
|
||||
... ) as smbfile:
|
||||
... df = pd.read_csv(smbfile, sep='|', header=None)
|
||||
|
||||
Note that you need to pass in a valid hostname or IP address for the host
|
||||
component of the URL. Do not use the Windows/NetBIOS machine name for the
|
||||
host component.
|
||||
|
||||
The first component of the path in the URL points to the name of the shared
|
||||
folder. Subsequent path components will point to the directory/folder/file.
|
||||
|
||||
The URL components ``workgroup`` , ``user``, ``password`` and ``port`` may be
|
||||
optional.
|
||||
|
||||
.. note::
|
||||
|
||||
This implementation requires `smbprotocol`_ to be installed, e.g.::
|
||||
|
||||
$ pip install smbprotocol
|
||||
# or
|
||||
# pip install smbprotocol[kerberos]
|
||||
|
||||
.. _smbprotocol: https://github.com/jborean93/smbprotocol#requirements
|
||||
|
||||
Note: if using this with ``open`` or ``open_files`` and full URLs,
|
||||
there is no way to tell if a path is relative, so all paths are assumed
|
||||
to be absolute.
|
||||
"""
|
||||
|
||||
protocol = "smb"
|
||||
|
||||
# pylint: disable=too-many-arguments
|
||||
def __init__(
|
||||
self,
|
||||
host,
|
||||
port=None,
|
||||
username=None,
|
||||
password=None,
|
||||
timeout=60,
|
||||
encrypt=None,
|
||||
share_access=None,
|
||||
register_session_retries=4,
|
||||
register_session_retry_wait=1,
|
||||
register_session_retry_factor=10,
|
||||
auto_mkdir=False,
|
||||
**kwargs,
|
||||
):
|
||||
"""
|
||||
You can use _get_kwargs_from_urls to get some kwargs from
|
||||
a reasonable SMB url.
|
||||
|
||||
Authentication will be anonymous or integrated if username/password are not
|
||||
given.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
host: str
|
||||
The remote server name/ip to connect to
|
||||
port: int or None
|
||||
Port to connect with. Usually 445, sometimes 139.
|
||||
username: str or None
|
||||
Username to connect with. Required if Kerberos auth is not being used.
|
||||
password: str or None
|
||||
User's password on the server, if using username
|
||||
timeout: int
|
||||
Connection timeout in seconds
|
||||
encrypt: bool
|
||||
Whether to force encryption or not; once this has been set to True,
|
||||
the session cannot be changed back to False.
|
||||
share_access: str or None
|
||||
Specifies the default access applied to file open operations
|
||||
performed with this file system object.
|
||||
This affects whether other processes can concurrently open a handle
|
||||
to the same file.
|
||||
|
||||
- None (the default): exclusively locks the file until closed.
|
||||
- 'r': Allow other handles to be opened with read access.
|
||||
- 'w': Allow other handles to be opened with write access.
|
||||
- 'd': Allow other handles to be opened with delete access.
|
||||
register_session_retries: int
|
||||
Number of retries to register a session with the server. Retries are not performed
|
||||
for authentication errors, as they indicate invalid credentials rather than network
|
||||
issues. If set to a negative value, no registration attempts will be performed.
|
||||
register_session_retry_wait: int
|
||||
Time in seconds to wait between each retry. Number must be non-negative.
|
||||
register_session_retry_factor: int
|
||||
Base factor for the wait time between each retry. The wait time
|
||||
is calculated using an exponential function. For factor=1 all wait times
|
||||
will be equal to `register_session_retry_wait`. For any number of retries,
|
||||
the last wait time will be equal to `register_session_retry_wait` and for retries>1
|
||||
the first wait time will be equal to `register_session_retry_wait / factor`.
|
||||
Number must be equal to or greater than 1. Optimal factor is 10.
|
||||
auto_mkdir: bool
|
||||
Whether, when opening a file, the directory containing it should
|
||||
be created (if it doesn't already exist). This is assumed by pyarrow
|
||||
and zarr-python code.
|
||||
"""
|
||||
super().__init__(**kwargs)
|
||||
self.host = host
|
||||
self.port = port
|
||||
self.username = username
|
||||
self.password = password
|
||||
self.timeout = timeout
|
||||
self.encrypt = encrypt
|
||||
self.temppath = kwargs.pop("temppath", "")
|
||||
self.share_access = share_access
|
||||
self.register_session_retries = register_session_retries
|
||||
if register_session_retry_wait < 0:
|
||||
raise ValueError(
|
||||
"register_session_retry_wait must be a non-negative integer"
|
||||
)
|
||||
self.register_session_retry_wait = register_session_retry_wait
|
||||
if register_session_retry_factor < 1:
|
||||
raise ValueError(
|
||||
"register_session_retry_factor must be a positive "
|
||||
"integer equal to or greater than 1"
|
||||
)
|
||||
self.register_session_retry_factor = register_session_retry_factor
|
||||
self.auto_mkdir = auto_mkdir
|
||||
self._connect()
|
||||
|
||||
@property
|
||||
def _port(self):
|
||||
return 445 if self.port is None else self.port
|
||||
|
||||
def _connect(self):
|
||||
import time
|
||||
|
||||
if self.register_session_retries <= -1:
|
||||
return
|
||||
|
||||
retried_errors = []
|
||||
|
||||
wait_time = self.register_session_retry_wait
|
||||
n_waits = (
|
||||
self.register_session_retries - 1
|
||||
) # -1 = No wait time after the last retry
|
||||
factor = self.register_session_retry_factor
|
||||
|
||||
# Generate wait times for each retry attempt.
|
||||
# Wait times are calculated using an exponential function. For factor=1 all wait times
|
||||
# will be equal to `wait`. For any number of retries the last wait time will be
|
||||
# equal to `wait` and for retries>2 the first wait time will be equal to `wait / factor`.
|
||||
wait_times = iter(
|
||||
factor ** (n / n_waits - 1) * wait_time for n in range(0, n_waits + 1)
|
||||
)
|
||||
|
||||
for attempt in range(self.register_session_retries + 1):
|
||||
try:
|
||||
smbclient.register_session(
|
||||
self.host,
|
||||
username=self.username,
|
||||
password=self.password,
|
||||
port=self._port,
|
||||
encrypt=self.encrypt,
|
||||
connection_timeout=self.timeout,
|
||||
)
|
||||
return
|
||||
except (
|
||||
smbprotocol.exceptions.SMBAuthenticationError,
|
||||
smbprotocol.exceptions.LogonFailure,
|
||||
):
|
||||
# These exceptions should not be retried, as they clearly indicate
|
||||
# that the credentials are invalid and not a network issue.
|
||||
raise
|
||||
except ValueError as exc:
|
||||
if re.findall(r"\[Errno -\d+]", str(exc)):
|
||||
# This exception is raised by the smbprotocol.transport:Tcp.connect
|
||||
# and originates from socket.gaierror (OSError). These exceptions might
|
||||
# be raised due to network instability. We will retry to connect.
|
||||
retried_errors.append(exc)
|
||||
else:
|
||||
# All other ValueError exceptions should be raised, as they are not
|
||||
# related to network issues.
|
||||
raise
|
||||
except Exception as exc:
|
||||
# Save the exception and retry the connection. This except clause might be dropped
|
||||
# in the future, once all exceptions suited for retry are identified.
|
||||
retried_errors.append(exc)
|
||||
|
||||
if attempt < self.register_session_retries:
|
||||
time.sleep(next(wait_times))
|
||||
|
||||
# Raise last exception to inform user about the connection issues.
|
||||
# Note: Should we use ExceptionGroup to raise all exceptions?
|
||||
raise retried_errors[-1]
|
||||
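# Editor's note (worked example, not upstream code): with the defaults
# register_session_retries=4, register_session_retry_wait=1 and
# register_session_retry_factor=10, n_waits is 3 and the generator above yields
#   10 ** (0/3 - 1) * 1 = 0.100 s
#   10 ** (1/3 - 1) * 1 ≈ 0.215 s
#   10 ** (2/3 - 1) * 1 ≈ 0.464 s
#   10 ** (3/3 - 1) * 1 = 1.000 s
# so the final wait equals register_session_retry_wait and earlier waits shrink
# geometrically towards wait / factor.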
|
||||
@classmethod
|
||||
def _strip_protocol(cls, path):
|
||||
return infer_storage_options(path)["path"]
|
||||
|
||||
@staticmethod
|
||||
def _get_kwargs_from_urls(path):
|
||||
# smb://workgroup;user:password@host:port/share/folder/file.csv
|
||||
out = infer_storage_options(path)
|
||||
out.pop("path", None)
|
||||
out.pop("protocol", None)
|
||||
return out
|
||||
|
||||
def mkdir(self, path, create_parents=True, **kwargs):
|
||||
wpath = _as_unc_path(self.host, path)
|
||||
if create_parents:
|
||||
smbclient.makedirs(wpath, exist_ok=False, port=self._port, **kwargs)
|
||||
else:
|
||||
smbclient.mkdir(wpath, port=self._port, **kwargs)
|
||||
|
||||
def makedirs(self, path, exist_ok=False):
|
||||
if _share_has_path(path):
|
||||
wpath = _as_unc_path(self.host, path)
|
||||
smbclient.makedirs(wpath, exist_ok=exist_ok, port=self._port)
|
||||
|
||||
def rmdir(self, path):
|
||||
if _share_has_path(path):
|
||||
wpath = _as_unc_path(self.host, path)
|
||||
smbclient.rmdir(wpath, port=self._port)
|
||||
|
||||
def info(self, path, **kwargs):
|
||||
wpath = _as_unc_path(self.host, path)
|
||||
stats = smbclient.stat(wpath, port=self._port, **kwargs)
|
||||
if S_ISDIR(stats.st_mode):
|
||||
stype = "directory"
|
||||
elif S_ISLNK(stats.st_mode):
|
||||
stype = "link"
|
||||
else:
|
||||
stype = "file"
|
||||
res = {
|
||||
"name": path + "/" if stype == "directory" else path,
|
||||
"size": stats.st_size,
|
||||
"type": stype,
|
||||
"uid": stats.st_uid,
|
||||
"gid": stats.st_gid,
|
||||
"time": stats.st_atime,
|
||||
"mtime": stats.st_mtime,
|
||||
}
|
||||
return res
|
||||
|
||||
def created(self, path):
|
||||
"""Return the created timestamp of a file as a datetime.datetime"""
|
||||
wpath = _as_unc_path(self.host, path)
|
||||
stats = smbclient.stat(wpath, port=self._port)
|
||||
return datetime.datetime.fromtimestamp(stats.st_ctime, tz=datetime.timezone.utc)
|
||||
|
||||
def modified(self, path):
|
||||
"""Return the modified timestamp of a file as a datetime.datetime"""
|
||||
wpath = _as_unc_path(self.host, path)
|
||||
stats = smbclient.stat(wpath, port=self._port)
|
||||
return datetime.datetime.fromtimestamp(stats.st_mtime, tz=datetime.timezone.utc)
|
||||
|
||||
def ls(self, path, detail=True, **kwargs):
|
||||
unc = _as_unc_path(self.host, path)
|
||||
listed = smbclient.listdir(unc, port=self._port, **kwargs)
|
||||
dirs = ["/".join([path.rstrip("/"), p]) for p in listed]
|
||||
if detail:
|
||||
dirs = [self.info(d) for d in dirs]
|
||||
return dirs
|
||||
|
||||
# pylint: disable=too-many-arguments
|
||||
def _open(
|
||||
self,
|
||||
path,
|
||||
mode="rb",
|
||||
block_size=-1,
|
||||
autocommit=True,
|
||||
cache_options=None,
|
||||
**kwargs,
|
||||
):
|
||||
"""
|
||||
block_size: int or None
|
||||
If 0, no buffering; if 1, line buffering; if >1, buffer that many bytes
|
||||
|
||||
Notes
|
||||
-----
|
||||
By specifying 'share_access' in 'kwargs' it is possible to override the
|
||||
default shared access setting applied in the constructor of this object.
|
||||
"""
|
||||
if self.auto_mkdir and "w" in mode:
|
||||
self.makedirs(self._parent(path), exist_ok=True)
|
||||
bls = block_size if block_size is not None and block_size >= 0 else -1
|
||||
wpath = _as_unc_path(self.host, path)
|
||||
share_access = kwargs.pop("share_access", self.share_access)
|
||||
if "w" in mode and autocommit is False:
|
||||
temp = _as_temp_path(self.host, path, self.temppath)
|
||||
return SMBFileOpener(
|
||||
wpath, temp, mode, port=self._port, block_size=bls, **kwargs
|
||||
)
|
||||
return smbclient.open_file(
|
||||
wpath,
|
||||
mode,
|
||||
buffering=bls,
|
||||
share_access=share_access,
|
||||
port=self._port,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
def copy(self, path1, path2, **kwargs):
|
||||
"""Copy within two locations in the same filesystem"""
|
||||
wpath1 = _as_unc_path(self.host, path1)
|
||||
wpath2 = _as_unc_path(self.host, path2)
|
||||
if self.auto_mkdir:
|
||||
self.makedirs(self._parent(path2), exist_ok=True)
|
||||
smbclient.copyfile(wpath1, wpath2, port=self._port, **kwargs)
|
||||
|
||||
def _rm(self, path):
|
||||
if _share_has_path(path):
|
||||
wpath = _as_unc_path(self.host, path)
|
||||
stats = smbclient.stat(wpath, port=self._port)
|
||||
if S_ISDIR(stats.st_mode):
|
||||
smbclient.rmdir(wpath, port=self._port)
|
||||
else:
|
||||
smbclient.remove(wpath, port=self._port)
|
||||
|
||||
def mv(self, path1, path2, recursive=None, maxdepth=None, **kwargs):
|
||||
wpath1 = _as_unc_path(self.host, path1)
|
||||
wpath2 = _as_unc_path(self.host, path2)
|
||||
smbclient.rename(wpath1, wpath2, port=self._port, **kwargs)
|
||||
|
||||
|
||||
def _as_unc_path(host, path):
|
||||
rpath = path.replace("/", "\\")
|
||||
unc = f"\\\\{host}{rpath}"
|
||||
return unc
|
||||
|
||||
|
||||
def _as_temp_path(host, path, temppath):
|
||||
share = path.split("/")[1]
|
||||
temp_file = f"/{share}{temppath}/{uuid.uuid4()}"
|
||||
unc = _as_unc_path(host, temp_file)
|
||||
return unc
|
||||
|
||||
|
||||
def _share_has_path(path):
|
||||
parts = path.count("/")
|
||||
if path.endswith("/"):
|
||||
return parts > 2
|
||||
return parts > 1
|
||||
|
||||
|
||||
class SMBFileOpener:
|
||||
"""writes to remote temporary file, move on commit"""
|
||||
|
||||
def __init__(self, path, temp, mode, port=445, block_size=-1, **kwargs):
|
||||
self.path = path
|
||||
self.temp = temp
|
||||
self.mode = mode
|
||||
self.block_size = block_size
|
||||
self.kwargs = kwargs
|
||||
self.smbfile = None
|
||||
self._incontext = False
|
||||
self.port = port
|
||||
self._open()
|
||||
|
||||
def _open(self):
|
||||
if self.smbfile is None or self.smbfile.closed:
|
||||
self.smbfile = smbclient.open_file(
|
||||
self.temp,
|
||||
self.mode,
|
||||
port=self.port,
|
||||
buffering=self.block_size,
|
||||
**self.kwargs,
|
||||
)
|
||||
|
||||
def commit(self):
|
||||
"""Move temp file to definitive on success."""
|
||||
# TODO: use transaction support in SMB protocol
|
||||
smbclient.replace(self.temp, self.path, port=self.port)
|
||||
|
||||
def discard(self):
|
||||
"""Remove the temp file on failure."""
|
||||
smbclient.remove(self.temp, port=self.port)
|
||||
|
||||
def __fspath__(self):
|
||||
return self.path
|
||||
|
||||
def __iter__(self):
|
||||
return self.smbfile.__iter__()
|
||||
|
||||
def __getattr__(self, item):
|
||||
return getattr(self.smbfile, item)
|
||||
|
||||
def __enter__(self):
|
||||
self._incontext = True
|
||||
return self.smbfile.__enter__()
|
||||
|
||||
def __exit__(self, exc_type, exc_value, traceback):
|
||||
self._incontext = False
|
||||
self.smbfile.__exit__(exc_type, exc_value, traceback)
|
||||
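# Editor's illustrative sketch (not part of the upstream module): how the path
# helpers above translate fsspec-style paths. "fileserver" is a placeholder.
def _unc_path_examples():
    assert _as_unc_path("fileserver", "/share/folder/file.csv") == (
        "\\\\fileserver\\share\\folder\\file.csv"
    )
    assert _share_has_path("/share/folder/file.csv")
    assert not _share_has_path("/share")  # a bare share name has no inner path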
124
.venv/lib/python3.10/site-packages/fsspec/implementations/tar.py
Normal file
@@ -0,0 +1,124 @@
|
||||
import logging
|
||||
import tarfile
|
||||
|
||||
import fsspec
|
||||
from fsspec.archive import AbstractArchiveFileSystem
|
||||
from fsspec.compression import compr
|
||||
from fsspec.utils import infer_compression
|
||||
|
||||
typemap = {b"0": "file", b"5": "directory"}
|
||||
|
||||
logger = logging.getLogger("tar")
|
||||
|
||||
|
||||
class TarFileSystem(AbstractArchiveFileSystem):
|
||||
"""Compressed Tar archives as a file-system (read-only)
|
||||
|
||||
Supports the following formats:
|
||||
tar.gz, tar.bz2, tar.xz
|
||||
"""
|
||||
|
||||
root_marker = ""
|
||||
protocol = "tar"
|
||||
cachable = False
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
fo="",
|
||||
index_store=None,
|
||||
target_options=None,
|
||||
target_protocol=None,
|
||||
compression=None,
|
||||
**kwargs,
|
||||
):
|
||||
super().__init__(**kwargs)
|
||||
target_options = target_options or {}
|
||||
|
||||
if isinstance(fo, str):
|
||||
self.of = fsspec.open(fo, protocol=target_protocol, **target_options)
|
||||
fo = self.of.open() # keep the reference
|
||||
|
||||
# Try to infer compression.
|
||||
if compression is None:
|
||||
name = None
|
||||
|
||||
# Try different ways to get hold of the filename. `fo` might either
|
||||
# be a `fsspec.LocalFileOpener`, an `io.BufferedReader` or an
|
||||
# `fsspec.AbstractFileSystem` instance.
|
||||
try:
|
||||
# Amended io.BufferedReader or similar.
|
||||
# This uses a "protocol extension" where original filenames are
|
||||
# propagated to archive-like filesystems in order to let them
|
||||
# infer the right compression appropriately.
|
||||
if hasattr(fo, "original"):
|
||||
name = fo.original
|
||||
|
||||
# fsspec.LocalFileOpener
|
||||
elif hasattr(fo, "path"):
|
||||
name = fo.path
|
||||
|
||||
# io.BufferedReader
|
||||
elif hasattr(fo, "name"):
|
||||
name = fo.name
|
||||
|
||||
# fsspec.AbstractFileSystem
|
||||
elif hasattr(fo, "info"):
|
||||
name = fo.info()["name"]
|
||||
|
||||
except Exception as ex:
|
||||
logger.warning(
|
||||
f"Unable to determine file name, not inferring compression: {ex}"
|
||||
)
|
||||
|
||||
if name is not None:
|
||||
compression = infer_compression(name)
|
||||
logger.info(f"Inferred compression {compression} from file name {name}")
|
||||
|
||||
if compression is not None:
|
||||
# TODO: tarfile already implements compression with modes like "'r:gz'",
|
||||
# but would seeking to an offset within the file still work then?
|
||||
fo = compr[compression](fo)
|
||||
|
||||
self._fo_ref = fo
|
||||
self.fo = fo # the whole instance is a context
|
||||
self.tar = tarfile.TarFile(fileobj=self.fo)
|
||||
self.dir_cache = None
|
||||
|
||||
self.index_store = index_store
|
||||
self.index = None
|
||||
self._index()
|
||||
|
||||
def _index(self):
|
||||
# TODO: load and set saved index, if exists
|
||||
out = {}
|
||||
for ti in self.tar:
|
||||
info = ti.get_info()
|
||||
info["type"] = typemap.get(info["type"], "file")
|
||||
name = ti.get_info()["name"].rstrip("/")
|
||||
out[name] = (info, ti.offset_data)
|
||||
|
||||
self.index = out
|
||||
# TODO: save index to self.index_store here, if set
|
||||
|
||||
def _get_dirs(self):
|
||||
if self.dir_cache is not None:
|
||||
return
|
||||
|
||||
# This enables ls to get directories as children as well as files
|
||||
self.dir_cache = {
|
||||
dirname: {"name": dirname, "size": 0, "type": "directory"}
|
||||
for dirname in self._all_dirnames(self.tar.getnames())
|
||||
}
|
||||
for member in self.tar.getmembers():
|
||||
info = member.get_info()
|
||||
info["name"] = info["name"].rstrip("/")
|
||||
info["type"] = typemap.get(info["type"], "file")
|
||||
self.dir_cache[info["name"]] = info
|
||||
|
||||
def _open(self, path, mode="rb", **kwargs):
|
||||
if mode != "rb":
|
||||
raise ValueError("Read-only filesystem implementation")
|
||||
details, offset = self.index[path]
|
||||
if details["type"] != "file":
|
||||
raise ValueError("Can only handle regular files")
|
||||
return self.tar.extractfile(path)
|
||||
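# Editor's illustrative sketch (not part of the upstream module): reading one
# member from a (possibly compressed) archive via fsspec URL chaining. The
# archive and member names are placeholders.
def _tar_example(archive="archive.tar.gz", member="data/table.csv"):
    import fsspec

    with fsspec.open(f"tar://{member}::{archive}", "rb") as f:
        return f.read()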
@@ -0,0 +1,485 @@
|
||||
# https://hadoop.apache.org/docs/r1.0.4/webhdfs.html
|
||||
|
||||
import logging
|
||||
import os
|
||||
import secrets
|
||||
import shutil
|
||||
import tempfile
|
||||
import uuid
|
||||
from contextlib import suppress
|
||||
from urllib.parse import quote
|
||||
|
||||
import requests
|
||||
|
||||
from ..spec import AbstractBufferedFile, AbstractFileSystem
|
||||
from ..utils import infer_storage_options, tokenize
|
||||
|
||||
logger = logging.getLogger("webhdfs")
|
||||
|
||||
|
||||
class WebHDFS(AbstractFileSystem):
|
||||
"""
|
||||
Interface to HDFS over HTTP using the WebHDFS API. Also supports HttpFS gateways.
|
||||
|
||||
Four auth mechanisms are supported:
|
||||
|
||||
insecure: no auth is done, and the user is assumed to be whoever they
|
||||
say they are (parameter ``user``), or a predefined value such as
|
||||
"dr.who" if not given
|
||||
spnego: when kerberos authentication is enabled, auth is negotiated by
|
||||
requests_kerberos https://github.com/requests/requests-kerberos .
|
||||
This establishes a session based on existing kinit login and/or
|
||||
specified principal/password; parameters are passed with ``kerb_kwargs``
|
||||
token: uses an existing Hadoop delegation token from another secured
|
||||
service. Indeed, this client can also generate such tokens when
|
||||
not insecure. Note that tokens expire, but can be renewed (by a
|
||||
previously specified user) and may allow for proxying.
|
||||
basic-auth: used when both parameter ``user`` and parameter ``password``
|
||||
are provided.
|
||||
|
||||
"""
|
||||
|
||||
tempdir = str(tempfile.gettempdir())
|
||||
protocol = "webhdfs", "webHDFS"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
host,
|
||||
port=50070,
|
||||
kerberos=False,
|
||||
token=None,
|
||||
user=None,
|
||||
password=None,
|
||||
proxy_to=None,
|
||||
kerb_kwargs=None,
|
||||
data_proxy=None,
|
||||
use_https=False,
|
||||
session_cert=None,
|
||||
session_verify=True,
|
||||
**kwargs,
|
||||
):
|
||||
"""
|
||||
Parameters
|
||||
----------
|
||||
host: str
|
||||
Name-node address
|
||||
port: int
|
||||
Port for webHDFS
|
||||
kerberos: bool
|
||||
Whether to authenticate with kerberos for this connection
|
||||
token: str or None
|
||||
If given, use this token on every call to authenticate. A user
|
||||
and user-proxy may be encoded in the token and should not also be
|
||||
given
|
||||
user: str or None
|
||||
If given, assert the user name to connect with
|
||||
password: str or None
|
||||
If given, assert the password to use for basic auth. If password
|
||||
is provided, user must be provided also
|
||||
proxy_to: str or None
|
||||
If given, the user has the authority to proxy, and this value is
|
||||
the user in whose name actions are taken
|
||||
kerb_kwargs: dict
|
||||
Any extra arguments for HTTPKerberosAuth, see
|
||||
`<https://github.com/requests/requests-kerberos/blob/master/requests_kerberos/kerberos_.py>`_
|
||||
data_proxy: dict, callable or None
|
||||
If given, map data-node addresses. This can be necessary if the
|
||||
HDFS cluster is behind a proxy, running on Docker or otherwise has
|
||||
a mismatch between the host-names given by the name-node and the
|
||||
address by which to refer to them from the client. If a dict,
|
||||
maps host names ``host->data_proxy[host]``; if a callable, full
|
||||
URLs are passed, and function must conform to
|
||||
``url->data_proxy(url)``.
|
||||
use_https: bool
|
||||
Whether to connect to the Name-node using HTTPS instead of HTTP
|
||||
session_cert: str or Tuple[str, str] or None
|
||||
Path to a certificate file, or tuple of (cert, key) files to use
|
||||
for the requests.Session
|
||||
session_verify: str, bool or None
|
||||
Path to a certificate file to use for verifying the requests.Session.
|
||||
kwargs
|
||||
"""
|
||||
if self._cached:
|
||||
return
|
||||
super().__init__(**kwargs)
|
||||
self.url = f"{'https' if use_https else 'http'}://{host}:{port}/webhdfs/v1"
|
||||
self.kerb = kerberos
|
||||
self.kerb_kwargs = kerb_kwargs or {}
|
||||
self.pars = {}
|
||||
self.proxy = data_proxy or {}
|
||||
if token is not None:
|
||||
if user is not None or proxy_to is not None:
|
||||
raise ValueError(
|
||||
"If passing a delegation token, must not set "
|
||||
"user or proxy_to, as these are encoded in the"
|
||||
" token"
|
||||
)
|
||||
self.pars["delegation"] = token
|
||||
self.user = user
|
||||
self.password = password
|
||||
|
||||
if password is not None:
|
||||
if user is None:
|
||||
raise ValueError(
|
||||
"If passing a password, the user must also be "
|
||||
"set in order to set up the basic-auth"
|
||||
)
|
||||
else:
|
||||
if user is not None:
|
||||
self.pars["user.name"] = user
|
||||
|
||||
if proxy_to is not None:
|
||||
self.pars["doas"] = proxy_to
|
||||
if kerberos and user is not None:
|
||||
raise ValueError(
|
||||
"If using Kerberos auth, do not specify the "
|
||||
"user, this is handled by kinit."
|
||||
)
|
||||
|
||||
self.session_cert = session_cert
|
||||
self.session_verify = session_verify
|
||||
|
||||
self._connect()
|
||||
|
||||
self._fsid = f"webhdfs_{tokenize(host, port)}"
|
||||
|
||||
@property
|
||||
def fsid(self):
|
||||
return self._fsid
|
||||
|
||||
def _connect(self):
|
||||
self.session = requests.Session()
|
||||
|
||||
if self.session_cert:
|
||||
self.session.cert = self.session_cert
|
||||
|
||||
self.session.verify = self.session_verify
|
||||
|
||||
if self.kerb:
|
||||
from requests_kerberos import HTTPKerberosAuth
|
||||
|
||||
self.session.auth = HTTPKerberosAuth(**self.kerb_kwargs)
|
||||
|
||||
if self.user is not None and self.password is not None:
|
||||
from requests.auth import HTTPBasicAuth
|
||||
|
||||
self.session.auth = HTTPBasicAuth(self.user, self.password)
|
||||
|
||||
def _call(self, op, method="get", path=None, data=None, redirect=True, **kwargs):
|
||||
path = self._strip_protocol(path) if path is not None else ""
|
||||
url = self._apply_proxy(self.url + quote(path, safe="/="))
|
||||
args = kwargs.copy()
|
||||
args.update(self.pars)
|
||||
args["op"] = op.upper()
|
||||
logger.debug("sending %s with %s", url, method)
|
||||
out = self.session.request(
|
||||
method=method.upper(),
|
||||
url=url,
|
||||
params=args,
|
||||
data=data,
|
||||
allow_redirects=redirect,
|
||||
)
|
||||
if out.status_code in [400, 401, 403, 404, 500]:
|
||||
try:
|
||||
err = out.json()
|
||||
msg = err["RemoteException"]["message"]
|
||||
exp = err["RemoteException"]["exception"]
|
||||
except (ValueError, KeyError):
|
||||
pass
|
||||
else:
|
||||
if exp in ["IllegalArgumentException", "UnsupportedOperationException"]:
|
||||
raise ValueError(msg)
|
||||
elif exp in ["SecurityException", "AccessControlException"]:
|
||||
raise PermissionError(msg)
|
||||
elif exp in ["FileNotFoundException"]:
|
||||
raise FileNotFoundError(msg)
|
||||
else:
|
||||
raise RuntimeError(msg)
|
||||
out.raise_for_status()
|
||||
return out
|
||||
|
||||
def _open(
|
||||
self,
|
||||
path,
|
||||
mode="rb",
|
||||
block_size=None,
|
||||
autocommit=True,
|
||||
replication=None,
|
||||
permissions=None,
|
||||
**kwargs,
|
||||
):
|
||||
"""
|
||||
|
||||
Parameters
|
||||
----------
|
||||
path: str
|
||||
File location
|
||||
mode: str
|
||||
'rb', 'wb', etc.
|
||||
block_size: int
|
||||
Client buffer size for read-ahead or write buffer
|
||||
autocommit: bool
|
||||
If False, writes to temporary file that only gets put in final
|
||||
location upon commit
|
||||
replication: int
|
||||
Number of copies of file on the cluster, write mode only
|
||||
permissions: str or int
|
||||
posix permissions, write mode only
|
||||
kwargs
|
||||
|
||||
Returns
|
||||
-------
|
||||
WebHDFile instance
|
||||
"""
|
||||
block_size = block_size or self.blocksize
|
||||
return WebHDFile(
|
||||
self,
|
||||
path,
|
||||
mode=mode,
|
||||
block_size=block_size,
|
||||
tempdir=self.tempdir,
|
||||
autocommit=autocommit,
|
||||
replication=replication,
|
||||
permissions=permissions,
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def _process_info(info):
|
||||
info["type"] = info["type"].lower()
|
||||
info["size"] = info["length"]
|
||||
return info
|
||||
|
||||
@classmethod
|
||||
def _strip_protocol(cls, path):
|
||||
return infer_storage_options(path)["path"]
|
||||
|
||||
@staticmethod
|
||||
def _get_kwargs_from_urls(urlpath):
|
||||
out = infer_storage_options(urlpath)
|
||||
out.pop("path", None)
|
||||
out.pop("protocol", None)
|
||||
if "username" in out:
|
||||
out["user"] = out.pop("username")
|
||||
return out
|
||||
|
||||
def info(self, path):
|
||||
out = self._call("GETFILESTATUS", path=path)
|
||||
info = out.json()["FileStatus"]
|
||||
info["name"] = path
|
||||
return self._process_info(info)
|
||||
|
||||
def ls(self, path, detail=False):
|
||||
out = self._call("LISTSTATUS", path=path)
|
||||
infos = out.json()["FileStatuses"]["FileStatus"]
|
||||
for info in infos:
|
||||
self._process_info(info)
|
||||
info["name"] = path.rstrip("/") + "/" + info["pathSuffix"]
|
||||
if detail:
|
||||
return sorted(infos, key=lambda i: i["name"])
|
||||
else:
|
||||
return sorted(info["name"] for info in infos)
|
||||
|
||||
def content_summary(self, path):
|
||||
"""Total numbers of files, directories and bytes under path"""
|
||||
out = self._call("GETCONTENTSUMMARY", path=path)
|
||||
return out.json()["ContentSummary"]
|
||||
|
||||
def ukey(self, path):
|
||||
"""Checksum info of file, giving method and result"""
|
||||
out = self._call("GETFILECHECKSUM", path=path, redirect=False)
|
||||
if "Location" in out.headers:
|
||||
location = self._apply_proxy(out.headers["Location"])
|
||||
out2 = self.session.get(location)
|
||||
out2.raise_for_status()
|
||||
return out2.json()["FileChecksum"]
|
||||
else:
|
||||
out.raise_for_status()
|
||||
return out.json()["FileChecksum"]
|
||||
|
||||
def home_directory(self):
|
||||
"""Get user's home directory"""
|
||||
out = self._call("GETHOMEDIRECTORY")
|
||||
return out.json()["Path"]
|
||||
|
||||
def get_delegation_token(self, renewer=None):
|
||||
"""Retrieve a token which can give the same authority to other users
|
||||
|
||||
Parameters
|
||||
----------
|
||||
renewer: str or None
|
||||
User who may use this token; if None, will be current user
|
||||
"""
|
||||
if renewer:
|
||||
out = self._call("GETDELEGATIONTOKEN", renewer=renewer)
|
||||
else:
|
||||
out = self._call("GETDELEGATIONTOKEN")
|
||||
t = out.json()["Token"]
|
||||
if t is None:
|
||||
raise ValueError("No token available for this user/security context")
|
||||
return t["urlString"]
|
||||
|
||||
def renew_delegation_token(self, token):
|
||||
"""Make token live longer. Returns new expiry time"""
|
||||
out = self._call("RENEWDELEGATIONTOKEN", method="put", token=token)
|
||||
return out.json()["long"]
|
||||
|
||||
def cancel_delegation_token(self, token):
|
||||
"""Stop the token from being useful"""
|
||||
self._call("CANCELDELEGATIONTOKEN", method="put", token=token)
|
||||
|
||||
def chmod(self, path, mod):
|
||||
"""Set the permission at path
|
||||
|
||||
Parameters
|
||||
----------
|
||||
path: str
|
||||
location to set (file or directory)
|
||||
mod: str or int
|
||||
posix representation of permission, given as an octal string, e.g. '777'
|
||||
or 0o777
|
||||
"""
|
||||
self._call("SETPERMISSION", method="put", path=path, permission=mod)
|
||||
|
||||
def chown(self, path, owner=None, group=None):
|
||||
"""Change owning user and/or group"""
|
||||
kwargs = {}
|
||||
if owner is not None:
|
||||
kwargs["owner"] = owner
|
||||
if group is not None:
|
||||
kwargs["group"] = group
|
||||
self._call("SETOWNER", method="put", path=path, **kwargs)
|
||||
|
||||
def set_replication(self, path, replication):
|
||||
"""
|
||||
Set file replication factor
|
||||
|
||||
Parameters
|
||||
----------
|
||||
path: str
|
||||
File location (not for directories)
|
||||
replication: int
|
||||
Number of copies of file on the cluster. Should be smaller than
|
||||
number of data nodes; normally 3 on most systems.
|
||||
"""
|
||||
self._call("SETREPLICATION", path=path, method="put", replication=replication)
|
||||
|
||||
def mkdir(self, path, **kwargs):
|
||||
self._call("MKDIRS", method="put", path=path)
|
||||
|
||||
def makedirs(self, path, exist_ok=False):
|
||||
if exist_ok is False and self.exists(path):
|
||||
raise FileExistsError(path)
|
||||
self.mkdir(path)
|
||||
|
||||
def mv(self, path1, path2, **kwargs):
|
||||
self._call("RENAME", method="put", path=path1, destination=path2)
|
||||
|
||||
def rm(self, path, recursive=False, **kwargs):
|
||||
self._call(
|
||||
"DELETE",
|
||||
method="delete",
|
||||
path=path,
|
||||
recursive="true" if recursive else "false",
|
||||
)
|
||||
|
||||
def rm_file(self, path, **kwargs):
|
||||
self.rm(path)
|
||||
|
||||
def cp_file(self, lpath, rpath, **kwargs):
|
||||
with self.open(lpath) as lstream:
|
||||
tmp_fname = "/".join([self._parent(rpath), f".tmp.{secrets.token_hex(16)}"])
|
||||
# Perform an atomic copy (stream to a temporary file and
|
||||
# move it to the actual destination).
|
||||
try:
|
||||
with self.open(tmp_fname, "wb") as rstream:
|
||||
shutil.copyfileobj(lstream, rstream)
|
||||
self.mv(tmp_fname, rpath)
|
||||
except BaseException:
|
||||
with suppress(FileNotFoundError):
|
||||
self.rm(tmp_fname)
|
||||
raise
|
||||
|
||||
def _apply_proxy(self, location):
|
||||
if self.proxy and callable(self.proxy):
|
||||
location = self.proxy(location)
|
||||
elif self.proxy:
|
||||
# as a dict
|
||||
for k, v in self.proxy.items():
|
||||
location = location.replace(k, v, 1)
|
||||
return location
|
||||
|
||||
|
||||
class WebHDFile(AbstractBufferedFile):
|
||||
"""A file living in HDFS over webHDFS"""
|
||||
|
||||
def __init__(self, fs, path, **kwargs):
|
||||
super().__init__(fs, path, **kwargs)
|
||||
kwargs = kwargs.copy()
|
||||
if kwargs.get("permissions", None) is None:
|
||||
kwargs.pop("permissions", None)
|
||||
if kwargs.get("replication", None) is None:
|
||||
kwargs.pop("replication", None)
|
||||
self.permissions = kwargs.pop("permissions", 511)
|
||||
tempdir = kwargs.pop("tempdir")
|
||||
if kwargs.pop("autocommit", False) is False:
|
||||
self.target = self.path
|
||||
self.path = os.path.join(tempdir, str(uuid.uuid4()))
|
||||
|
||||
def _upload_chunk(self, final=False):
|
||||
"""Write one part of a multi-block file upload
|
||||
|
||||
Parameters
|
||||
==========
|
||||
final: bool
|
||||
This is the last block, so should complete file, if
|
||||
self.autocommit is True.
|
||||
"""
|
||||
out = self.fs.session.post(
|
||||
self.location,
|
||||
data=self.buffer.getvalue(),
|
||||
headers={"content-type": "application/octet-stream"},
|
||||
)
|
||||
out.raise_for_status()
|
||||
return True
|
||||
|
||||
def _initiate_upload(self):
|
||||
"""Create remote file/upload"""
|
||||
kwargs = self.kwargs.copy()
|
||||
if "a" in self.mode:
|
||||
op, method = "APPEND", "POST"
|
||||
else:
|
||||
op, method = "CREATE", "PUT"
|
||||
kwargs["overwrite"] = "true"
|
||||
out = self.fs._call(op, method, self.path, redirect=False, **kwargs)
|
||||
location = self.fs._apply_proxy(out.headers["Location"])
|
||||
if "w" in self.mode:
|
||||
# create empty file to append to
|
||||
out2 = self.fs.session.put(
|
||||
location, headers={"content-type": "application/octet-stream"}
|
||||
)
|
||||
out2.raise_for_status()
|
||||
# after creating empty file, change location to append to
|
||||
out2 = self.fs._call("APPEND", "POST", self.path, redirect=False, **kwargs)
|
||||
self.location = self.fs._apply_proxy(out2.headers["Location"])
|
||||
|
||||
def _fetch_range(self, start, end):
|
||||
start = max(start, 0)
|
||||
end = min(self.size, end)
|
||||
if start >= end or start >= self.size:
|
||||
return b""
|
||||
out = self.fs._call(
|
||||
"OPEN", path=self.path, offset=start, length=end - start, redirect=False
|
||||
)
|
||||
out.raise_for_status()
|
||||
if "Location" in out.headers:
|
||||
location = out.headers["Location"]
|
||||
out2 = self.fs.session.get(self.fs._apply_proxy(location))
|
||||
return out2.content
|
||||
else:
|
||||
return out.content
|
||||
|
||||
def commit(self):
|
||||
self.fs.mv(self.path, self.target)
|
||||
|
||||
def discard(self):
|
||||
self.fs.rm(self.path)
|
||||
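# Editor's illustrative sketch (not part of the upstream module): connecting with
# basic auth. Host, port and credentials are placeholders; passing both ``user``
# and ``password`` selects the basic-auth branch in ``_connect`` above.
def _webhdfs_example():
    import fsspec

    fs = fsspec.filesystem(
        "webhdfs",
        host="namenode.example.com",   # hypothetical name-node
        port=9870,                     # common default for Hadoop 3.x
        user="demo",
        password="secret",
    )
    return fs.ls("/user/demo", detail=False)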
177
.venv/lib/python3.10/site-packages/fsspec/implementations/zip.py
Normal file
@@ -0,0 +1,177 @@
|
||||
import os
|
||||
import zipfile
|
||||
|
||||
import fsspec
|
||||
from fsspec.archive import AbstractArchiveFileSystem
|
||||
|
||||
|
||||
class ZipFileSystem(AbstractArchiveFileSystem):
|
||||
"""Read/Write contents of ZIP archive as a file-system
|
||||
|
||||
Keeps file object open while instance lives.
|
||||
|
||||
This class is pickleable, but not necessarily thread-safe
|
||||
"""
|
||||
|
||||
root_marker = ""
|
||||
protocol = "zip"
|
||||
cachable = False
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
fo="",
|
||||
mode="r",
|
||||
target_protocol=None,
|
||||
target_options=None,
|
||||
compression=zipfile.ZIP_STORED,
|
||||
allowZip64=True,
|
||||
compresslevel=None,
|
||||
**kwargs,
|
||||
):
|
||||
"""
|
||||
Parameters
|
||||
----------
|
||||
fo: str or file-like
|
||||
Contains ZIP, and must exist. If a str, will fetch file using
|
||||
:meth:`~fsspec.open_files`, which must return one file exactly.
|
||||
mode: str
|
||||
Accept: "r", "w", "a"
|
||||
target_protocol: str (optional)
|
||||
If ``fo`` is a string, this value can be used to override the
|
||||
FS protocol inferred from a URL
|
||||
target_options: dict (optional)
|
||||
Kwargs passed when instantiating the target FS, if ``fo`` is
|
||||
a string.
|
||||
compression, allowZip64, compresslevel: passed to ZipFile
|
||||
Only relevant when creating a ZIP
|
||||
"""
|
||||
super().__init__(self, **kwargs)
|
||||
if mode not in set("rwa"):
|
||||
raise ValueError(f"mode '{mode}' no understood")
|
||||
self.mode = mode
|
||||
if isinstance(fo, (str, os.PathLike)):
|
||||
if mode == "a":
|
||||
m = "r+b"
|
||||
else:
|
||||
m = mode + "b"
|
||||
fo = fsspec.open(
|
||||
fo, mode=m, protocol=target_protocol, **(target_options or {})
|
||||
)
|
||||
self.force_zip_64 = allowZip64
|
||||
self.of = fo
|
||||
self.fo = fo.__enter__() # the whole instance is a context
|
||||
self.zip = zipfile.ZipFile(
|
||||
self.fo,
|
||||
mode=mode,
|
||||
compression=compression,
|
||||
allowZip64=allowZip64,
|
||||
compresslevel=compresslevel,
|
||||
)
|
||||
self.dir_cache = None
|
||||
|
||||
@classmethod
|
||||
def _strip_protocol(cls, path):
|
||||
# zip file paths are always relative to the archive root
|
||||
return super()._strip_protocol(path).lstrip("/")
|
||||
|
||||
def __del__(self):
|
||||
if hasattr(self, "zip"):
|
||||
self.close()
|
||||
del self.zip
|
||||
|
||||
def close(self):
|
||||
"""Commits any write changes to the file. Done on ``del`` too."""
|
||||
self.zip.close()
|
||||
|
||||
def _get_dirs(self):
|
||||
if self.dir_cache is None or self.mode in set("wa"):
|
||||
# when writing, dir_cache is always in the ZipFile's attributes,
|
||||
# not read from the file.
|
||||
files = self.zip.infolist()
|
||||
self.dir_cache = {
|
||||
dirname.rstrip("/"): {
|
||||
"name": dirname.rstrip("/"),
|
||||
"size": 0,
|
||||
"type": "directory",
|
||||
}
|
||||
for dirname in self._all_dirnames(self.zip.namelist())
|
||||
}
|
||||
for z in files:
|
||||
f = {s: getattr(z, s, None) for s in zipfile.ZipInfo.__slots__}
|
||||
f.update(
|
||||
{
|
||||
"name": z.filename.rstrip("/"),
|
||||
"size": z.file_size,
|
||||
"type": ("directory" if z.is_dir() else "file"),
|
||||
}
|
||||
)
|
||||
self.dir_cache[f["name"]] = f
|
||||
|
||||
def pipe_file(self, path, value, **kwargs):
|
||||
# override upstream, because we know the exact file size in this case
|
||||
self.zip.writestr(path, value, **kwargs)
|
||||
|
||||
def _open(
|
||||
self,
|
||||
path,
|
||||
mode="rb",
|
||||
block_size=None,
|
||||
autocommit=True,
|
||||
cache_options=None,
|
||||
**kwargs,
|
||||
):
|
||||
path = self._strip_protocol(path)
|
||||
if "r" in mode and self.mode in set("wa"):
|
||||
if self.exists(path):
|
||||
raise OSError("ZipFS can only be open for reading or writing, not both")
|
||||
raise FileNotFoundError(path)
|
||||
if "r" in self.mode and "w" in mode:
|
||||
raise OSError("ZipFS can only be open for reading or writing, not both")
|
||||
out = self.zip.open(path, mode.strip("b"), force_zip64=self.force_zip_64)
|
||||
if "r" in mode:
|
||||
info = self.info(path)
|
||||
out.size = info["size"]
|
||||
out.name = info["name"]
|
||||
return out
|
||||
|
||||
def find(self, path, maxdepth=None, withdirs=False, detail=False, **kwargs):
|
||||
if maxdepth is not None and maxdepth < 1:
|
||||
raise ValueError("maxdepth must be at least 1")
|
||||
|
||||
# Remove the leading slash, as the zip file paths are always
|
||||
# given without a leading slash
|
||||
path = path.lstrip("/")
|
||||
path_parts = list(filter(lambda s: bool(s), path.split("/")))
|
||||
|
||||
def _matching_starts(file_path):
|
||||
file_parts = filter(lambda s: bool(s), file_path.split("/"))
|
||||
return all(a == b for a, b in zip(path_parts, file_parts))
|
||||
|
||||
self._get_dirs()
|
||||
|
||||
result = {}
|
||||
# To match posix find, if an exact file name is given, we should
|
||||
# return only that file
|
||||
if path in self.dir_cache and self.dir_cache[path]["type"] == "file":
|
||||
result[path] = self.dir_cache[path]
|
||||
return result if detail else [path]
|
||||
|
||||
for file_path, file_info in self.dir_cache.items():
|
||||
if not (path == "" or _matching_starts(file_path)):
|
||||
continue
|
||||
|
||||
if file_info["type"] == "directory":
|
||||
if withdirs:
|
||||
if file_path not in result:
|
||||
result[file_path.strip("/")] = file_info
|
||||
continue
|
||||
|
||||
if file_path not in result:
|
||||
result[file_path] = file_info if detail else None
|
||||
|
||||
if maxdepth:
|
||||
path_depth = path.count("/")
|
||||
result = {
|
||||
k: v for k, v in result.items() if k.count("/") - path_depth < maxdepth
|
||||
}
|
||||
return result if detail else sorted(result)
|
||||
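For orientation, here is a minimal, hypothetical usage sketch of the ZipFileSystem added above; the archive name and member path are placeholders and assume such a zip file already exists.

import fsspec

# Read-only view of an existing archive via the "zip" protocol registered above.
# "example.zip" and "folder/member.txt" are illustrative names only.
fs = fsspec.filesystem("zip", fo="example.zip")
print(fs.ls(""))  # list entries at the archive root
with fs.open("folder/member.txt", "rb") as f:
    data = f.read()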
121
.venv/lib/python3.10/site-packages/fsspec/json.py
Normal file
@@ -0,0 +1,121 @@
|
||||
import json
|
||||
from contextlib import suppress
|
||||
from pathlib import PurePath
|
||||
from typing import (
|
||||
Any,
|
||||
Callable,
|
||||
ClassVar,
|
||||
Dict,
|
||||
List,
|
||||
Mapping,
|
||||
Optional,
|
||||
Sequence,
|
||||
Tuple,
|
||||
)
|
||||
|
||||
from .registry import _import_class, get_filesystem_class
|
||||
from .spec import AbstractFileSystem
|
||||
|
||||
|
||||
class FilesystemJSONEncoder(json.JSONEncoder):
|
||||
include_password: ClassVar[bool] = True
|
||||
|
||||
def default(self, o: Any) -> Any:
|
||||
if isinstance(o, AbstractFileSystem):
|
||||
return o.to_dict(include_password=self.include_password)
|
||||
if isinstance(o, PurePath):
|
||||
cls = type(o)
|
||||
return {"cls": f"{cls.__module__}.{cls.__name__}", "str": str(o)}
|
||||
|
||||
return super().default(o)
|
||||
|
||||
def make_serializable(self, obj: Any) -> Any:
|
||||
"""
|
||||
Recursively converts an object so that it can be JSON serialized via
|
||||
:func:`json.dumps` and :func:`json.dump`, without actually calling
|
||||
said functions.
|
||||
"""
|
||||
if isinstance(obj, (str, int, float, bool)):
|
||||
return obj
|
||||
if isinstance(obj, Mapping):
|
||||
return {k: self.make_serializable(v) for k, v in obj.items()}
|
||||
if isinstance(obj, Sequence):
|
||||
return [self.make_serializable(v) for v in obj]
|
||||
|
||||
return self.default(obj)
|
||||
|
||||
|
||||
class FilesystemJSONDecoder(json.JSONDecoder):
|
||||
def __init__(
|
||||
self,
|
||||
*,
|
||||
object_hook: Optional[Callable[[Dict[str, Any]], Any]] = None,
|
||||
parse_float: Optional[Callable[[str], Any]] = None,
|
||||
parse_int: Optional[Callable[[str], Any]] = None,
|
||||
parse_constant: Optional[Callable[[str], Any]] = None,
|
||||
strict: bool = True,
|
||||
object_pairs_hook: Optional[Callable[[List[Tuple[str, Any]]], Any]] = None,
|
||||
) -> None:
|
||||
self.original_object_hook = object_hook
|
||||
|
||||
super().__init__(
|
||||
object_hook=self.custom_object_hook,
|
||||
parse_float=parse_float,
|
||||
parse_int=parse_int,
|
||||
parse_constant=parse_constant,
|
||||
strict=strict,
|
||||
object_pairs_hook=object_pairs_hook,
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def try_resolve_path_cls(cls, dct: Dict[str, Any]):
|
||||
with suppress(Exception):
|
||||
fqp = dct["cls"]
|
||||
|
||||
path_cls = _import_class(fqp)
|
||||
|
||||
if issubclass(path_cls, PurePath):
|
||||
return path_cls
|
||||
|
||||
return None
|
||||
|
||||
@classmethod
|
||||
def try_resolve_fs_cls(cls, dct: Dict[str, Any]):
|
||||
with suppress(Exception):
|
||||
if "cls" in dct:
|
||||
try:
|
||||
fs_cls = _import_class(dct["cls"])
|
||||
if issubclass(fs_cls, AbstractFileSystem):
|
||||
return fs_cls
|
||||
except Exception:
|
||||
if "protocol" in dct: # Fallback if cls cannot be imported
|
||||
return get_filesystem_class(dct["protocol"])
|
||||
|
||||
raise
|
||||
|
||||
return None
|
||||
|
||||
def custom_object_hook(self, dct: Dict[str, Any]):
|
||||
if "cls" in dct:
|
||||
if (obj_cls := self.try_resolve_fs_cls(dct)) is not None:
|
||||
return AbstractFileSystem.from_dict(dct)
|
||||
if (obj_cls := self.try_resolve_path_cls(dct)) is not None:
|
||||
return obj_cls(dct["str"])
|
||||
|
||||
if self.original_object_hook is not None:
|
||||
return self.original_object_hook(dct)
|
||||
|
||||
return dct
|
||||
|
||||
def unmake_serializable(self, obj: Any) -> Any:
|
||||
"""
|
||||
Inverse function of :meth:`FilesystemJSONEncoder.make_serializable`.
|
||||
"""
|
||||
if isinstance(obj, dict):
|
||||
obj = self.custom_object_hook(obj)
|
||||
if isinstance(obj, dict):
|
||||
return {k: self.unmake_serializable(v) for k, v in obj.items()}
|
||||
if isinstance(obj, (list, tuple)):
|
||||
return [self.unmake_serializable(v) for v in obj]
|
||||
|
||||
return obj
|
||||
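As a brief illustration of the encoder/decoder pair above, the following hedged sketch round-trips a filesystem instance through JSON; it uses the in-memory filesystem so no credentials or remote services are involved.

import json

import fsspec
from fsspec.json import FilesystemJSONDecoder, FilesystemJSONEncoder

fs = fsspec.filesystem("memory")
# The encoder serializes filesystem instances (via to_dict) and PurePath objects.
payload = json.dumps({"fs": fs}, cls=FilesystemJSONEncoder)
# The decoder resolves the stored "cls" back to a filesystem class and rebuilds it.
restored = json.loads(payload, cls=FilesystemJSONDecoder)
assert restored["fs"].protocol == fs.protocol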
251
.venv/lib/python3.10/site-packages/fsspec/mapping.py
Normal file
@@ -0,0 +1,251 @@
|
||||
import array
|
||||
import logging
|
||||
import posixpath
|
||||
import warnings
|
||||
from collections.abc import MutableMapping
|
||||
from functools import cached_property
|
||||
|
||||
from fsspec.core import url_to_fs
|
||||
|
||||
logger = logging.getLogger("fsspec.mapping")
|
||||
|
||||
|
||||
class FSMap(MutableMapping):
|
||||
"""Wrap a FileSystem instance as a mutable wrapping.
|
||||
|
||||
The keys of the mapping become files under the given root, and the
|
||||
values (which must be bytes) the contents of those files.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
root: string
|
||||
prefix for all the files
|
||||
fs: FileSystem instance
|
||||
check: bool (=True)
|
||||
performs a touch at the location, to check for write access.
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> fs = FileSystem(**parameters) # doctest: +SKIP
|
||||
>>> d = FSMap('my-data/path/', fs) # doctest: +SKIP
|
||||
or, more likely
|
||||
>>> d = fs.get_mapper('my-data/path/')
|
||||
|
||||
>>> d['loc1'] = b'Hello World' # doctest: +SKIP
|
||||
>>> list(d.keys()) # doctest: +SKIP
|
||||
['loc1']
|
||||
>>> d['loc1'] # doctest: +SKIP
|
||||
b'Hello World'
|
||||
"""
|
||||
|
||||
def __init__(self, root, fs, check=False, create=False, missing_exceptions=None):
|
||||
self.fs = fs
|
||||
self.root = fs._strip_protocol(root)
|
||||
self._root_key_to_str = fs._strip_protocol(posixpath.join(root, "x"))[:-1]
|
||||
if missing_exceptions is None:
|
||||
missing_exceptions = (
|
||||
FileNotFoundError,
|
||||
IsADirectoryError,
|
||||
NotADirectoryError,
|
||||
)
|
||||
self.missing_exceptions = missing_exceptions
|
||||
self.check = check
|
||||
self.create = create
|
||||
if create:
|
||||
if not self.fs.exists(root):
|
||||
self.fs.mkdir(root)
|
||||
if check:
|
||||
if not self.fs.exists(root):
|
||||
raise ValueError(
|
||||
f"Path {root} does not exist. Create "
|
||||
f" with the ``create=True`` keyword"
|
||||
)
|
||||
self.fs.touch(root + "/a")
|
||||
self.fs.rm(root + "/a")
|
||||
|
||||
@cached_property
|
||||
def dirfs(self):
|
||||
"""dirfs instance that can be used with the same keys as the mapper"""
|
||||
from .implementations.dirfs import DirFileSystem
|
||||
|
||||
return DirFileSystem(path=self._root_key_to_str, fs=self.fs)
|
||||
|
||||
def clear(self):
|
||||
"""Remove all keys below root - empties out mapping"""
|
||||
logger.info("Clear mapping at %s", self.root)
|
||||
try:
|
||||
self.fs.rm(self.root, True)
|
||||
self.fs.mkdir(self.root)
|
||||
except: # noqa: E722
|
||||
pass
|
||||
|
||||
def getitems(self, keys, on_error="raise"):
|
||||
"""Fetch multiple items from the store
|
||||
|
||||
If the backend is async-able, this might proceed concurrently
|
||||
|
||||
Parameters
|
||||
----------
|
||||
keys: list(str)
|
||||
The keys to be fetched
|
||||
on_error : "raise", "omit", "return"
|
||||
If raise, an underlying exception will be raised (converted to KeyError
|
||||
if the type is in self.missing_exceptions); if omit, keys with exception
|
||||
will simply not be included in the output; if "return", all keys are
|
||||
included in the output, but the value will be bytes or an exception
|
||||
instance.
|
||||
|
||||
Returns
|
||||
-------
|
||||
dict(key, bytes|exception)
|
||||
"""
|
||||
keys2 = [self._key_to_str(k) for k in keys]
|
||||
oe = on_error if on_error == "raise" else "return"
|
||||
try:
|
||||
out = self.fs.cat(keys2, on_error=oe)
|
||||
if isinstance(out, bytes):
|
||||
out = {keys2[0]: out}
|
||||
except self.missing_exceptions as e:
|
||||
raise KeyError from e
|
||||
out = {
|
||||
k: (KeyError() if isinstance(v, self.missing_exceptions) else v)
|
||||
for k, v in out.items()
|
||||
}
|
||||
return {
|
||||
key: out[k2] if on_error == "raise" else out.get(k2, KeyError(k2))
|
||||
for key, k2 in zip(keys, keys2)
|
||||
if on_error == "return" or not isinstance(out[k2], BaseException)
|
||||
}
|
||||
|
||||
def setitems(self, values_dict):
|
||||
"""Set the values of multiple items in the store
|
||||
|
||||
Parameters
|
||||
----------
|
||||
values_dict: dict(str, bytes)
|
||||
"""
|
||||
values = {self._key_to_str(k): maybe_convert(v) for k, v in values_dict.items()}
|
||||
self.fs.pipe(values)
|
||||
|
||||
def delitems(self, keys):
|
||||
"""Remove multiple keys from the store"""
|
||||
self.fs.rm([self._key_to_str(k) for k in keys])
|
||||
|
||||
def _key_to_str(self, key):
|
||||
"""Generate full path for the key"""
|
||||
if not isinstance(key, str):
|
||||
# raise TypeError("key must be of type `str`, got `{type(key).__name__}`"
|
||||
warnings.warn(
|
||||
"from fsspec 2023.5 onward FSMap non-str keys will raise TypeError",
|
||||
DeprecationWarning,
|
||||
)
|
||||
if isinstance(key, list):
|
||||
key = tuple(key)
|
||||
key = str(key)
|
||||
return f"{self._root_key_to_str}{key}".rstrip("/")
|
||||
|
||||
def _str_to_key(self, s):
|
||||
"""Strip path of to leave key name"""
|
||||
return s[len(self.root) :].lstrip("/")
|
||||
|
||||
def __getitem__(self, key, default=None):
|
||||
"""Retrieve data"""
|
||||
k = self._key_to_str(key)
|
||||
try:
|
||||
result = self.fs.cat(k)
|
||||
except self.missing_exceptions as exc:
|
||||
if default is not None:
|
||||
return default
|
||||
raise KeyError(key) from exc
|
||||
return result
|
||||
|
||||
def pop(self, key, default=None):
|
||||
"""Pop data"""
|
||||
result = self.__getitem__(key, default)
|
||||
try:
|
||||
del self[key]
|
||||
except KeyError:
|
||||
pass
|
||||
return result
|
||||
|
||||
def __setitem__(self, key, value):
|
||||
"""Store value in key"""
|
||||
key = self._key_to_str(key)
|
||||
self.fs.mkdirs(self.fs._parent(key), exist_ok=True)
|
||||
self.fs.pipe_file(key, maybe_convert(value))
|
||||
|
||||
def __iter__(self):
|
||||
return (self._str_to_key(x) for x in self.fs.find(self.root))
|
||||
|
||||
def __len__(self):
|
||||
return len(self.fs.find(self.root))
|
||||
|
||||
def __delitem__(self, key):
|
||||
"""Remove key"""
|
||||
try:
|
||||
self.fs.rm(self._key_to_str(key))
|
||||
except Exception as exc:
|
||||
raise KeyError from exc
|
||||
|
||||
def __contains__(self, key):
|
||||
"""Does key exist in mapping?"""
|
||||
path = self._key_to_str(key)
|
||||
return self.fs.isfile(path)
|
||||
|
||||
def __reduce__(self):
|
||||
return FSMap, (self.root, self.fs, False, False, self.missing_exceptions)
|
||||
|
||||
|
||||
def maybe_convert(value):
|
||||
if isinstance(value, array.array) or hasattr(value, "__array__"):
|
||||
# bytes-like things
|
||||
if hasattr(value, "dtype") and value.dtype.kind in "Mm":
|
||||
# The buffer interface doesn't support datetime64/timedelta64 numpy
|
||||
# arrays
|
||||
value = value.view("int64")
|
||||
value = bytes(memoryview(value))
|
||||
return value
|
||||
|
||||
|
||||
def get_mapper(
|
||||
url="",
|
||||
check=False,
|
||||
create=False,
|
||||
missing_exceptions=None,
|
||||
alternate_root=None,
|
||||
**kwargs,
|
||||
):
|
||||
"""Create key-value interface for given URL and options
|
||||
|
||||
The URL will be of the form "protocol://location" and point to the root
|
||||
of the mapper required. All keys will be file-names below this location,
|
||||
and their values the contents of each key.
|
||||
|
||||
Also accepts compound URLs like zip::s3://bucket/file.zip; see ``fsspec.open``.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
url: str
|
||||
Root URL of mapping
|
||||
check: bool
|
||||
Whether to attempt to read from the location before instantiation, to
|
||||
check that the mapping does exist
|
||||
create: bool
|
||||
Whether to make the directory corresponding to the root before
|
||||
instantiating
|
||||
missing_exceptions: None or tuple
|
||||
If given, these exception types will be regarded as missing keys and
|
||||
return KeyError when trying to read data. By default, you get
|
||||
(FileNotFoundError, IsADirectoryError, NotADirectoryError)
|
||||
alternate_root: None or str
|
||||
In cases of complex URLs, the parser may fail to pick the correct part
|
||||
for the mapper root, so this arg can override
|
||||
|
||||
Returns
|
||||
-------
|
||||
``FSMap`` instance, the dict-like key-value store.
|
||||
"""
|
||||
# Removing protocol here - could defer to each open() on the backend
|
||||
fs, urlpath = url_to_fs(url, **kwargs)
|
||||
root = alternate_root if alternate_root is not None else urlpath
|
||||
return FSMap(root, fs, check, create, missing_exceptions=missing_exceptions)
|
||||
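A short sketch of the key-value interface defined above, using the in-memory filesystem so it runs without any external storage; the root and key names are arbitrary.

import fsspec

# Keys become file names under the mapper root; values must be bytes-like.
m = fsspec.get_mapper("memory://demo")
m["group/key1"] = b"Hello"
print(list(m))          # ['group/key1']
print(m["group/key1"])  # b'Hello'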
541
.venv/lib/python3.10/site-packages/fsspec/parquet.py
Normal file
@@ -0,0 +1,541 @@
|
||||
import io
|
||||
import json
|
||||
import warnings
|
||||
|
||||
from .core import url_to_fs
|
||||
from .utils import merge_offset_ranges
|
||||
|
||||
# Parquet-Specific Utilities for fsspec
|
||||
#
|
||||
# Most of the functions defined in this module are NOT
|
||||
# intended for public consumption. The only exception
|
||||
# to this is `open_parquet_file`, which should be used
|
||||
# in place of `fs.open()` to open parquet-formatted files
|
||||
# on remote file systems.
|
||||
|
||||
|
||||
def open_parquet_file(
|
||||
path,
|
||||
mode="rb",
|
||||
fs=None,
|
||||
metadata=None,
|
||||
columns=None,
|
||||
row_groups=None,
|
||||
storage_options=None,
|
||||
strict=False,
|
||||
engine="auto",
|
||||
max_gap=64_000,
|
||||
max_block=256_000_000,
|
||||
footer_sample_size=1_000_000,
|
||||
**kwargs,
|
||||
):
|
||||
"""
|
||||
Return a file-like object for a single Parquet file.
|
||||
|
||||
The specified parquet `engine` will be used to parse the
|
||||
footer metadata, and determine the required byte ranges
|
||||
from the file. The target path will then be opened with
|
||||
the "parts" (`KnownPartsOfAFile`) caching strategy.
|
||||
|
||||
Note that this method is intended for usage with remote
|
||||
file systems, and is unlikely to improve parquet-read
|
||||
performance on local file systems.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
path: str
|
||||
Target file path.
|
||||
mode: str, optional
|
||||
Mode option to be passed through to `fs.open`. Default is "rb".
|
||||
metadata: Any, optional
|
||||
Parquet metadata object. Object type must be supported
|
||||
by the backend parquet engine. For now, only the "fastparquet"
|
||||
engine supports an explicit `ParquetFile` metadata object.
|
||||
If a metadata object is supplied, the remote footer metadata
|
||||
will not need to be transferred into local memory.
|
||||
fs: AbstractFileSystem, optional
|
||||
Filesystem object to use for opening the file. If nothing is
|
||||
specified, an `AbstractFileSystem` object will be inferred.
|
||||
engine : str, default "auto"
|
||||
Parquet engine to use for metadata parsing. Allowed options
|
||||
include "fastparquet", "pyarrow", and "auto". The specified
|
||||
engine must be installed in the current environment. If
|
||||
"auto" is specified, and both engines are installed,
|
||||
"fastparquet" will take precedence over "pyarrow".
|
||||
columns: list, optional
|
||||
List of all column names that may be read from the file.
|
||||
row_groups : list, optional
|
||||
List of all row-groups that may be read from the file. This
|
||||
may be a list of row-group indices (integers), or it may be
|
||||
a list of `RowGroup` metadata objects (if the "fastparquet"
|
||||
engine is used).
|
||||
storage_options : dict, optional
|
||||
Used to generate an `AbstractFileSystem` object if `fs` was
|
||||
not specified.
|
||||
strict : bool, optional
|
||||
Whether the resulting `KnownPartsOfAFile` cache should
|
||||
fetch reads that go beyond a known byte-range boundary.
|
||||
If `False` (the default), any read that ends outside a
|
||||
known part will be zero padded. Note that using
|
||||
`strict=True` may be useful for debugging.
|
||||
max_gap : int, optional
|
||||
Neighboring byte ranges will only be merged when their
|
||||
inter-range gap is <= `max_gap`. Default is 64KB.
|
||||
max_block : int, optional
|
||||
Neighboring byte ranges will only be merged when the size of
|
||||
the aggregated range is <= `max_block`. Default is 256MB.
|
||||
footer_sample_size : int, optional
|
||||
Number of bytes to read from the end of the path to look
|
||||
for the footer metadata. If the sampled bytes do not contain
|
||||
the footer, a second read request will be required, and
|
||||
performance will suffer. Default is 1MB.
|
||||
**kwargs :
|
||||
Optional key-word arguments to pass to `fs.open`
|
||||
"""
|
||||
|
||||
# Make sure we have an `AbstractFileSystem` object
|
||||
# to work with
|
||||
if fs is None:
|
||||
fs = url_to_fs(path, **(storage_options or {}))[0]
|
||||
|
||||
# For now, `columns == []` not supported. Just use
|
||||
# default `open` command with `path` input
|
||||
if columns is not None and len(columns) == 0:
|
||||
return fs.open(path, mode=mode)
|
||||
|
||||
# Set the engine
|
||||
engine = _set_engine(engine)
|
||||
|
||||
# Fetch the known byte ranges needed to read
|
||||
# `columns` and/or `row_groups`
|
||||
data = _get_parquet_byte_ranges(
|
||||
[path],
|
||||
fs,
|
||||
metadata=metadata,
|
||||
columns=columns,
|
||||
row_groups=row_groups,
|
||||
engine=engine,
|
||||
max_gap=max_gap,
|
||||
max_block=max_block,
|
||||
footer_sample_size=footer_sample_size,
|
||||
)
|
||||
|
||||
# Extract file name from `data`
|
||||
fn = next(iter(data)) if data else path
|
||||
|
||||
# Call self.open with "parts" caching
|
||||
options = kwargs.pop("cache_options", {}).copy()
|
||||
return fs.open(
|
||||
fn,
|
||||
mode=mode,
|
||||
cache_type="parts",
|
||||
cache_options={
|
||||
**options,
|
||||
"data": data.get(fn, {}),
|
||||
"strict": strict,
|
||||
},
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
|
||||
def _get_parquet_byte_ranges(
|
||||
paths,
|
||||
fs,
|
||||
metadata=None,
|
||||
columns=None,
|
||||
row_groups=None,
|
||||
max_gap=64_000,
|
||||
max_block=256_000_000,
|
||||
footer_sample_size=1_000_000,
|
||||
engine="auto",
|
||||
):
|
||||
"""Get a dictionary of the known byte ranges needed
|
||||
to read a specific column/row-group selection from a
|
||||
Parquet dataset. Each value in the output dictionary
|
||||
is intended for use as the `data` argument for the
|
||||
`KnownPartsOfAFile` caching strategy of a single path.
|
||||
"""
|
||||
|
||||
# Set engine if necessary
|
||||
if isinstance(engine, str):
|
||||
engine = _set_engine(engine)
|
||||
|
||||
# Pass to specialized function if metadata is defined
|
||||
if metadata is not None:
|
||||
# Use the provided parquet metadata object
|
||||
# to avoid transferring/parsing footer metadata
|
||||
return _get_parquet_byte_ranges_from_metadata(
|
||||
metadata,
|
||||
fs,
|
||||
engine,
|
||||
columns=columns,
|
||||
row_groups=row_groups,
|
||||
max_gap=max_gap,
|
||||
max_block=max_block,
|
||||
)
|
||||
|
||||
# Get file sizes asynchronously
|
||||
file_sizes = fs.sizes(paths)
|
||||
|
||||
# Populate global paths, starts, & ends
|
||||
result = {}
|
||||
data_paths = []
|
||||
data_starts = []
|
||||
data_ends = []
|
||||
add_header_magic = True
|
||||
if columns is None and row_groups is None:
|
||||
# We are NOT selecting specific columns or row-groups.
|
||||
#
|
||||
# We can avoid sampling the footers, and just transfer
|
||||
# all file data with cat_ranges
|
||||
for i, path in enumerate(paths):
|
||||
result[path] = {}
|
||||
for b in range(0, file_sizes[i], max_block):
|
||||
data_paths.append(path)
|
||||
data_starts.append(b)
|
||||
data_ends.append(min(b + max_block, file_sizes[i]))
|
||||
add_header_magic = False # "Magic" should already be included
|
||||
else:
|
||||
# We ARE selecting specific columns or row-groups.
|
||||
#
|
||||
# Gather file footers.
|
||||
# We just take the last `footer_sample_size` bytes of each
|
||||
# file (or the entire file if it is smaller than that)
|
||||
footer_starts = []
|
||||
footer_ends = []
|
||||
for i, path in enumerate(paths):
|
||||
footer_ends.append(file_sizes[i])
|
||||
sample_size = max(0, file_sizes[i] - footer_sample_size)
|
||||
footer_starts.append(sample_size)
|
||||
footer_samples = fs.cat_ranges(paths, footer_starts, footer_ends)
|
||||
|
||||
# Check our footer samples and re-sample if necessary.
|
||||
missing_footer_starts = footer_starts.copy()
|
||||
large_footer = 0
|
||||
for i, path in enumerate(paths):
|
||||
footer_size = int.from_bytes(footer_samples[i][-8:-4], "little")
|
||||
real_footer_start = file_sizes[i] - (footer_size + 8)
|
||||
if real_footer_start < footer_starts[i]:
|
||||
missing_footer_starts[i] = real_footer_start
|
||||
large_footer = max(large_footer, (footer_size + 8))
|
||||
if large_footer:
|
||||
warnings.warn(
|
||||
f"Not enough data was used to sample the parquet footer. "
|
||||
f"Try setting footer_sample_size >= {large_footer}."
|
||||
)
|
||||
for i, block in enumerate(
|
||||
fs.cat_ranges(
|
||||
paths,
|
||||
missing_footer_starts,
|
||||
footer_starts,
|
||||
)
|
||||
):
|
||||
footer_samples[i] = block + footer_samples[i]
|
||||
footer_starts[i] = missing_footer_starts[i]
|
||||
|
||||
# Calculate required byte ranges for each path
|
||||
for i, path in enumerate(paths):
|
||||
# Deal with small-file case.
|
||||
# Just include all remaining bytes of the file
|
||||
# in a single range.
|
||||
if file_sizes[i] < max_block:
|
||||
if footer_starts[i] > 0:
|
||||
# Only need to transfer the data if the
|
||||
# footer sample isn't already the whole file
|
||||
data_paths.append(path)
|
||||
data_starts.append(0)
|
||||
data_ends.append(footer_starts[i])
|
||||
continue
|
||||
|
||||
# Use "engine" to collect data byte ranges
|
||||
path_data_starts, path_data_ends = engine._parquet_byte_ranges(
|
||||
columns,
|
||||
row_groups=row_groups,
|
||||
footer=footer_samples[i],
|
||||
footer_start=footer_starts[i],
|
||||
)
|
||||
|
||||
data_paths += [path] * len(path_data_starts)
|
||||
data_starts += path_data_starts
|
||||
data_ends += path_data_ends
|
||||
|
||||
# Merge adjacent offset ranges
|
||||
data_paths, data_starts, data_ends = merge_offset_ranges(
|
||||
data_paths,
|
||||
data_starts,
|
||||
data_ends,
|
||||
max_gap=max_gap,
|
||||
max_block=max_block,
|
||||
sort=False, # Should already be sorted
|
||||
)
|
||||
|
||||
# Start by populating `result` with footer samples
|
||||
for i, path in enumerate(paths):
|
||||
result[path] = {(footer_starts[i], footer_ends[i]): footer_samples[i]}
|
||||
|
||||
# Transfer the data byte-ranges into local memory
|
||||
_transfer_ranges(fs, result, data_paths, data_starts, data_ends)
|
||||
|
||||
# Add b"PAR1" to header if necessary
|
||||
if add_header_magic:
|
||||
_add_header_magic(result)
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def _get_parquet_byte_ranges_from_metadata(
|
||||
metadata,
|
||||
fs,
|
||||
engine,
|
||||
columns=None,
|
||||
row_groups=None,
|
||||
max_gap=64_000,
|
||||
max_block=256_000_000,
|
||||
):
|
||||
"""Simplified version of `_get_parquet_byte_ranges` for
|
||||
the case that an engine-specific `metadata` object is
|
||||
provided, and the remote footer metadata does not need to
|
||||
be transferred before calculating the required byte ranges.
|
||||
"""
|
||||
|
||||
# Use "engine" to collect data byte ranges
|
||||
data_paths, data_starts, data_ends = engine._parquet_byte_ranges(
|
||||
columns,
|
||||
row_groups=row_groups,
|
||||
metadata=metadata,
|
||||
)
|
||||
|
||||
# Merge adjacent offset ranges
|
||||
data_paths, data_starts, data_ends = merge_offset_ranges(
|
||||
data_paths,
|
||||
data_starts,
|
||||
data_ends,
|
||||
max_gap=max_gap,
|
||||
max_block=max_block,
|
||||
sort=False, # Should be sorted
|
||||
)
|
||||
|
||||
# Transfer the data byte-ranges into local memory
|
||||
result = {fn: {} for fn in list(set(data_paths))}
|
||||
_transfer_ranges(fs, result, data_paths, data_starts, data_ends)
|
||||
|
||||
# Add b"PAR1" to header
|
||||
_add_header_magic(result)
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def _transfer_ranges(fs, blocks, paths, starts, ends):
|
||||
# Use cat_ranges to gather the data byte_ranges
|
||||
ranges = (paths, starts, ends)
|
||||
for path, start, stop, data in zip(*ranges, fs.cat_ranges(*ranges)):
|
||||
blocks[path][(start, stop)] = data
|
||||
|
||||
|
||||
def _add_header_magic(data):
|
||||
# Add b"PAR1" to file headers
|
||||
for path in list(data.keys()):
|
||||
add_magic = True
|
||||
for k in data[path]:
|
||||
if k[0] == 0 and k[1] >= 4:
|
||||
add_magic = False
|
||||
break
|
||||
if add_magic:
|
||||
data[path][(0, 4)] = b"PAR1"
|
||||
|
||||
|
||||
def _set_engine(engine_str):
|
||||
# Define a list of parquet engines to try
|
||||
if engine_str == "auto":
|
||||
try_engines = ("fastparquet", "pyarrow")
|
||||
elif not isinstance(engine_str, str):
|
||||
raise ValueError(
|
||||
"Failed to set parquet engine! "
|
||||
"Please pass 'fastparquet', 'pyarrow', or 'auto'"
|
||||
)
|
||||
elif engine_str not in ("fastparquet", "pyarrow"):
|
||||
raise ValueError(f"{engine_str} engine not supported by `fsspec.parquet`")
|
||||
else:
|
||||
try_engines = [engine_str]
|
||||
|
||||
# Try importing the engines in `try_engines`,
|
||||
# and choose the first one that succeeds
|
||||
for engine in try_engines:
|
||||
try:
|
||||
if engine == "fastparquet":
|
||||
return FastparquetEngine()
|
||||
elif engine == "pyarrow":
|
||||
return PyarrowEngine()
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
# Raise an error if a supported parquet engine
|
||||
# was not found
|
||||
raise ImportError(
|
||||
f"The following parquet engines are not installed "
|
||||
f"in your python environment: {try_engines}."
|
||||
f"Please install 'fastparquert' or 'pyarrow' to "
|
||||
f"utilize the `fsspec.parquet` module."
|
||||
)
|
||||
|
||||
|
||||
class FastparquetEngine:
|
||||
# The purpose of the FastparquetEngine class is
|
||||
# to check if fastparquet can be imported (on initialization)
|
||||
# and to define a `_parquet_byte_ranges` method. In the
|
||||
# future, this class may also be used to define other
|
||||
# methods/logic that are specific to fastparquet.
|
||||
|
||||
def __init__(self):
|
||||
import fastparquet as fp
|
||||
|
||||
self.fp = fp
|
||||
|
||||
def _row_group_filename(self, row_group, pf):
|
||||
return pf.row_group_filename(row_group)
|
||||
|
||||
def _parquet_byte_ranges(
|
||||
self,
|
||||
columns,
|
||||
row_groups=None,
|
||||
metadata=None,
|
||||
footer=None,
|
||||
footer_start=None,
|
||||
):
|
||||
# Initialize offset ranges and define ParquetFile metadata
|
||||
pf = metadata
|
||||
data_paths, data_starts, data_ends = [], [], []
|
||||
if pf is None:
|
||||
pf = self.fp.ParquetFile(io.BytesIO(footer))
|
||||
|
||||
# Convert columns to a set and add any index columns
|
||||
# specified in the pandas metadata (just in case)
|
||||
column_set = None if columns is None else set(columns)
|
||||
if column_set is not None and hasattr(pf, "pandas_metadata"):
|
||||
md_index = [
|
||||
ind
|
||||
for ind in pf.pandas_metadata.get("index_columns", [])
|
||||
# Ignore RangeIndex information
|
||||
if not isinstance(ind, dict)
|
||||
]
|
||||
column_set |= set(md_index)
|
||||
|
||||
# Check if row_groups is a list of integers
|
||||
# or a list of row-group metadata
|
||||
if row_groups and not isinstance(row_groups[0], int):
|
||||
# Input row_groups contains row-group metadata
|
||||
row_group_indices = None
|
||||
else:
|
||||
# Input row_groups contains row-group indices
|
||||
row_group_indices = row_groups
|
||||
row_groups = pf.row_groups
|
||||
|
||||
# Loop through column chunks to add required byte ranges
|
||||
for r, row_group in enumerate(row_groups):
|
||||
# Skip this row-group if we are targeting
|
||||
# specific row-groups
|
||||
if row_group_indices is None or r in row_group_indices:
|
||||
# Find the target parquet-file path for `row_group`
|
||||
fn = self._row_group_filename(row_group, pf)
|
||||
|
||||
for column in row_group.columns:
|
||||
name = column.meta_data.path_in_schema[0]
|
||||
# Skip this column if we are targeting a
|
||||
# specific set of columns
|
||||
if column_set is None or name in column_set:
|
||||
file_offset0 = column.meta_data.dictionary_page_offset
|
||||
if file_offset0 is None:
|
||||
file_offset0 = column.meta_data.data_page_offset
|
||||
num_bytes = column.meta_data.total_compressed_size
|
||||
if footer_start is None or file_offset0 < footer_start:
|
||||
data_paths.append(fn)
|
||||
data_starts.append(file_offset0)
|
||||
data_ends.append(
|
||||
min(
|
||||
file_offset0 + num_bytes,
|
||||
footer_start or (file_offset0 + num_bytes),
|
||||
)
|
||||
)
|
||||
|
||||
if metadata:
|
||||
# The metadata in this call may map to multiple
|
||||
# file paths. Need to include `data_paths`
|
||||
return data_paths, data_starts, data_ends
|
||||
return data_starts, data_ends
|
||||
|
||||
|
||||
class PyarrowEngine:
|
||||
# The purpose of the PyarrowEngine class is
|
||||
# to check if pyarrow can be imported (on initialization)
|
||||
# and to define a `_parquet_byte_ranges` method. In the
|
||||
# future, this class may also be used to define other
|
||||
# methods/logic that are specific to pyarrow.
|
||||
|
||||
def __init__(self):
|
||||
import pyarrow.parquet as pq
|
||||
|
||||
self.pq = pq
|
||||
|
||||
def _row_group_filename(self, row_group, metadata):
|
||||
raise NotImplementedError
|
||||
|
||||
def _parquet_byte_ranges(
|
||||
self,
|
||||
columns,
|
||||
row_groups=None,
|
||||
metadata=None,
|
||||
footer=None,
|
||||
footer_start=None,
|
||||
):
|
||||
if metadata is not None:
|
||||
raise ValueError("metadata input not supported for PyarrowEngine")
|
||||
|
||||
data_starts, data_ends = [], []
|
||||
md = self.pq.ParquetFile(io.BytesIO(footer)).metadata
|
||||
|
||||
# Convert columns to a set and add any index columns
|
||||
# specified in the pandas metadata (just in case)
|
||||
column_set = None if columns is None else set(columns)
|
||||
if column_set is not None:
|
||||
schema = md.schema.to_arrow_schema()
|
||||
has_pandas_metadata = (
|
||||
schema.metadata is not None and b"pandas" in schema.metadata
|
||||
)
|
||||
if has_pandas_metadata:
|
||||
md_index = [
|
||||
ind
|
||||
for ind in json.loads(
|
||||
schema.metadata[b"pandas"].decode("utf8")
|
||||
).get("index_columns", [])
|
||||
# Ignore RangeIndex information
|
||||
if not isinstance(ind, dict)
|
||||
]
|
||||
column_set |= set(md_index)
|
||||
|
||||
# Loop through column chunks to add required byte ranges
|
||||
for r in range(md.num_row_groups):
|
||||
# Skip this row-group if we are targeting
|
||||
# specific row-groups
|
||||
if row_groups is None or r in row_groups:
|
||||
row_group = md.row_group(r)
|
||||
for c in range(row_group.num_columns):
|
||||
column = row_group.column(c)
|
||||
name = column.path_in_schema
|
||||
# Skip this column if we are targeting a
|
||||
# specific set of columns
|
||||
split_name = name.split(".")[0]
|
||||
if (
|
||||
column_set is None
|
||||
or name in column_set
|
||||
or split_name in column_set
|
||||
):
|
||||
file_offset0 = column.dictionary_page_offset
|
||||
if file_offset0 is None:
|
||||
file_offset0 = column.data_page_offset
|
||||
num_bytes = column.total_compressed_size
|
||||
if file_offset0 < footer_start:
|
||||
data_starts.append(file_offset0)
|
||||
data_ends.append(
|
||||
min(file_offset0 + num_bytes, footer_start)
|
||||
)
|
||||
return data_starts, data_ends
|
||||
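A hedged example of how open_parquet_file is typically combined with a dataframe reader; the S3 path, storage options, and column name are placeholders, and it assumes pandas plus fastparquet or pyarrow are installed.

import pandas as pd

import fsspec.parquet

# Only the byte ranges needed for column "x" (plus the footer) are transferred.
# "s3://bucket/data.parquet" is a hypothetical location.
with fsspec.parquet.open_parquet_file(
    "s3://bucket/data.parquet",
    columns=["x"],
    storage_options={"anon": True},
) as f:
    df = pd.read_parquet(f, columns=["x"])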
318
.venv/lib/python3.10/site-packages/fsspec/registry.py
Normal file
@@ -0,0 +1,318 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import importlib
|
||||
import types
|
||||
import warnings
|
||||
|
||||
__all__ = ["registry", "get_filesystem_class", "default"]
|
||||
|
||||
# internal, mutable
|
||||
_registry: dict[str, type] = {}
|
||||
|
||||
# external, immutable
|
||||
registry = types.MappingProxyType(_registry)
|
||||
default = "file"
|
||||
|
||||
|
||||
def register_implementation(name, cls, clobber=False, errtxt=None):
|
||||
"""Add implementation class to the registry
|
||||
|
||||
Parameters
|
||||
----------
|
||||
name: str
|
||||
Protocol name to associate with the class
|
||||
cls: class or str
|
||||
if a class: fsspec-compliant implementation class (normally inherits from
|
||||
``fsspec.AbstractFileSystem``), gets added straight to the registry. If a
|
||||
str, the full path to an implementation class like package.module.class,
|
||||
which gets added to known_implementations,
|
||||
so the import is deferred until the filesystem is actually used.
|
||||
clobber: bool (optional)
|
||||
Whether to overwrite a protocol with the same name; if False, will raise
|
||||
instead.
|
||||
errtxt: str (optional)
|
||||
If given, then a failure to import the given class will result in this
|
||||
text being given.
|
||||
"""
|
||||
if isinstance(cls, str):
|
||||
if name in known_implementations and clobber is False:
|
||||
if cls != known_implementations[name]["class"]:
|
||||
raise ValueError(
|
||||
f"Name ({name}) already in the known_implementations and clobber "
|
||||
f"is False"
|
||||
)
|
||||
else:
|
||||
known_implementations[name] = {
|
||||
"class": cls,
|
||||
"err": errtxt or f"{cls} import failed for protocol {name}",
|
||||
}
|
||||
|
||||
else:
|
||||
if name in registry and clobber is False:
|
||||
if _registry[name] is not cls:
|
||||
raise ValueError(
|
||||
f"Name ({name}) already in the registry and clobber is False"
|
||||
)
|
||||
else:
|
||||
_registry[name] = cls
|
||||
|
||||
|
||||
# protocols mapped to the class which implements them. This dict can be
|
||||
# updated with register_implementation
|
||||
known_implementations = {
|
||||
"abfs": {
|
||||
"class": "adlfs.AzureBlobFileSystem",
|
||||
"err": "Install adlfs to access Azure Datalake Gen2 and Azure Blob Storage",
|
||||
},
|
||||
"adl": {
|
||||
"class": "adlfs.AzureDatalakeFileSystem",
|
||||
"err": "Install adlfs to access Azure Datalake Gen1",
|
||||
},
|
||||
"arrow_hdfs": {
|
||||
"class": "fsspec.implementations.arrow.HadoopFileSystem",
|
||||
"err": "pyarrow and local java libraries required for HDFS",
|
||||
},
|
||||
"async_wrapper": {
|
||||
"class": "fsspec.asyn_wrapper.AsyncWrapperFileSystem",
|
||||
},
|
||||
"asynclocal": {
|
||||
"class": "morefs.asyn_local.AsyncLocalFileSystem",
|
||||
"err": "Install 'morefs[asynclocalfs]' to use AsyncLocalFileSystem",
|
||||
},
|
||||
"az": {
|
||||
"class": "adlfs.AzureBlobFileSystem",
|
||||
"err": "Install adlfs to access Azure Datalake Gen2 and Azure Blob Storage",
|
||||
},
|
||||
"blockcache": {"class": "fsspec.implementations.cached.CachingFileSystem"},
|
||||
"box": {
|
||||
"class": "boxfs.BoxFileSystem",
|
||||
"err": "Please install boxfs to access BoxFileSystem",
|
||||
},
|
||||
"cached": {"class": "fsspec.implementations.cached.CachingFileSystem"},
|
||||
"dask": {
|
||||
"class": "fsspec.implementations.dask.DaskWorkerFileSystem",
|
||||
"err": "Install dask distributed to access worker file system",
|
||||
},
|
||||
"data": {"class": "fsspec.implementations.data.DataFileSystem"},
|
||||
"dbfs": {
|
||||
"class": "fsspec.implementations.dbfs.DatabricksFileSystem",
|
||||
"err": "Install the requests package to use the DatabricksFileSystem",
|
||||
},
|
||||
"dir": {"class": "fsspec.implementations.dirfs.DirFileSystem"},
|
||||
"dropbox": {
|
||||
"class": "dropboxdrivefs.DropboxDriveFileSystem",
|
||||
"err": (
|
||||
'DropboxFileSystem requires "dropboxdrivefs","requests" and "'
|
||||
'"dropbox" to be installed'
|
||||
),
|
||||
},
|
||||
"dvc": {
|
||||
"class": "dvc.api.DVCFileSystem",
|
||||
"err": "Install dvc to access DVCFileSystem",
|
||||
},
|
||||
"file": {"class": "fsspec.implementations.local.LocalFileSystem"},
|
||||
"filecache": {"class": "fsspec.implementations.cached.WholeFileCacheFileSystem"},
|
||||
"ftp": {"class": "fsspec.implementations.ftp.FTPFileSystem"},
|
||||
"gcs": {
|
||||
"class": "gcsfs.GCSFileSystem",
|
||||
"err": "Please install gcsfs to access Google Storage",
|
||||
},
|
||||
"gdrive": {
|
||||
"class": "gdrivefs.GoogleDriveFileSystem",
|
||||
"err": "Please install gdrivefs for access to Google Drive",
|
||||
},
|
||||
"generic": {"class": "fsspec.generic.GenericFileSystem"},
|
||||
"git": {
|
||||
"class": "fsspec.implementations.git.GitFileSystem",
|
||||
"err": "Install pygit2 to browse local git repos",
|
||||
},
|
||||
"github": {
|
||||
"class": "fsspec.implementations.github.GithubFileSystem",
|
||||
"err": "Install the requests package to use the github FS",
|
||||
},
|
||||
"gs": {
|
||||
"class": "gcsfs.GCSFileSystem",
|
||||
"err": "Please install gcsfs to access Google Storage",
|
||||
},
|
||||
"hdfs": {
|
||||
"class": "fsspec.implementations.arrow.HadoopFileSystem",
|
||||
"err": "pyarrow and local java libraries required for HDFS",
|
||||
},
|
||||
"hf": {
|
||||
"class": "huggingface_hub.HfFileSystem",
|
||||
"err": "Install huggingface_hub to access HfFileSystem",
|
||||
},
|
||||
"http": {
|
||||
"class": "fsspec.implementations.http.HTTPFileSystem",
|
||||
"err": 'HTTPFileSystem requires "requests" and "aiohttp" to be installed',
|
||||
},
|
||||
"https": {
|
||||
"class": "fsspec.implementations.http.HTTPFileSystem",
|
||||
"err": 'HTTPFileSystem requires "requests" and "aiohttp" to be installed',
|
||||
},
|
||||
"jlab": {
|
||||
"class": "fsspec.implementations.jupyter.JupyterFileSystem",
|
||||
"err": "Jupyter FS requires requests to be installed",
|
||||
},
|
||||
"jupyter": {
|
||||
"class": "fsspec.implementations.jupyter.JupyterFileSystem",
|
||||
"err": "Jupyter FS requires requests to be installed",
|
||||
},
|
||||
"lakefs": {
|
||||
"class": "lakefs_spec.LakeFSFileSystem",
|
||||
"err": "Please install lakefs-spec to access LakeFSFileSystem",
|
||||
},
|
||||
"libarchive": {
|
||||
"class": "fsspec.implementations.libarchive.LibArchiveFileSystem",
|
||||
"err": "LibArchive requires to be installed",
|
||||
},
|
||||
"local": {"class": "fsspec.implementations.local.LocalFileSystem"},
|
||||
"memory": {"class": "fsspec.implementations.memory.MemoryFileSystem"},
|
||||
"oci": {
|
||||
"class": "ocifs.OCIFileSystem",
|
||||
"err": "Install ocifs to access OCI Object Storage",
|
||||
},
|
||||
"ocilake": {
|
||||
"class": "ocifs.OCIFileSystem",
|
||||
"err": "Install ocifs to access OCI Data Lake",
|
||||
},
|
||||
"oss": {
|
||||
"class": "ossfs.OSSFileSystem",
|
||||
"err": "Install ossfs to access Alibaba Object Storage System",
|
||||
},
|
||||
"reference": {"class": "fsspec.implementations.reference.ReferenceFileSystem"},
|
||||
"root": {
|
||||
"class": "fsspec_xrootd.XRootDFileSystem",
|
||||
"err": (
|
||||
"Install fsspec-xrootd to access xrootd storage system. "
|
||||
"Note: 'root' is the protocol name for xrootd storage systems, "
|
||||
"not referring to root directories"
|
||||
),
|
||||
},
|
||||
"s3": {"class": "s3fs.S3FileSystem", "err": "Install s3fs to access S3"},
|
||||
"s3a": {"class": "s3fs.S3FileSystem", "err": "Install s3fs to access S3"},
|
||||
"sftp": {
|
||||
"class": "fsspec.implementations.sftp.SFTPFileSystem",
|
||||
"err": 'SFTPFileSystem requires "paramiko" to be installed',
|
||||
},
|
||||
"simplecache": {"class": "fsspec.implementations.cached.SimpleCacheFileSystem"},
|
||||
"smb": {
|
||||
"class": "fsspec.implementations.smb.SMBFileSystem",
|
||||
"err": 'SMB requires "smbprotocol" or "smbprotocol[kerberos]" installed',
|
||||
},
|
||||
"ssh": {
|
||||
"class": "fsspec.implementations.sftp.SFTPFileSystem",
|
||||
"err": 'SFTPFileSystem requires "paramiko" to be installed',
|
||||
},
|
||||
"tar": {"class": "fsspec.implementations.tar.TarFileSystem"},
|
||||
"tosfs": {
|
||||
"class": "tosfs.TosFileSystem",
|
||||
"err": "Install tosfs to access ByteDance volcano engine Tinder Object Storage",
|
||||
},
|
||||
"wandb": {"class": "wandbfs.WandbFS", "err": "Install wandbfs to access wandb"},
|
||||
"webdav": {
|
||||
"class": "webdav4.fsspec.WebdavFileSystem",
|
||||
"err": "Install webdav4 to access WebDAV",
|
||||
},
|
||||
"webhdfs": {
|
||||
"class": "fsspec.implementations.webhdfs.WebHDFS",
|
||||
"err": 'webHDFS access requires "requests" to be installed',
|
||||
},
|
||||
"zip": {"class": "fsspec.implementations.zip.ZipFileSystem"},
|
||||
}
|
||||
|
||||
assert list(known_implementations) == sorted(known_implementations), (
|
||||
"Not in alphabetical order"
|
||||
)
|
||||
|
||||
|
||||
def get_filesystem_class(protocol):
|
||||
"""Fetch named protocol implementation from the registry
|
||||
|
||||
The dict ``known_implementations`` maps protocol names to the locations
|
||||
of classes implementing the corresponding file-system. When used for the
|
||||
first time, appropriate imports will happen and the class will be placed in
|
||||
the registry. All subsequent calls will fetch directly from the registry.
|
||||
|
||||
Some protocol implementations require additional dependencies, and so the
|
||||
import may fail. In this case, the string in the "err" field of the
|
||||
``known_implementations`` will be given as the error message.
|
||||
"""
|
||||
if not protocol:
|
||||
protocol = default
|
||||
|
||||
if protocol not in registry:
|
||||
if protocol not in known_implementations:
|
||||
raise ValueError(f"Protocol not known: {protocol}")
|
||||
bit = known_implementations[protocol]
|
||||
try:
|
||||
register_implementation(protocol, _import_class(bit["class"]))
|
||||
except ImportError as e:
|
||||
raise ImportError(bit.get("err")) from e
|
||||
cls = registry[protocol]
|
||||
if getattr(cls, "protocol", None) in ("abstract", None):
|
||||
cls.protocol = protocol
|
||||
|
||||
return cls
|
||||
|
||||
|
||||
s3_msg = """Your installed version of s3fs is very old and known to cause
|
||||
severe performance issues, see also https://github.com/dask/dask/issues/10276
|
||||
|
||||
To fix, you should specify a lower version bound on s3fs, or
|
||||
update the current installation.
|
||||
"""
|
||||
|
||||
|
||||
def _import_class(fqp: str):
|
||||
"""Take a fully-qualified path and return the imported class or identifier.
|
||||
|
||||
``fqp`` is of the form "package.module.klass" or
|
||||
"package.module:subobject.klass".
|
||||
|
||||
Warnings
|
||||
--------
|
||||
This can import arbitrary modules. Make sure you haven't installed any modules
|
||||
that may execute malicious code at import time.
|
||||
"""
|
||||
if ":" in fqp:
|
||||
mod, name = fqp.rsplit(":", 1)
|
||||
else:
|
||||
mod, name = fqp.rsplit(".", 1)
|
||||
|
||||
is_s3 = mod == "s3fs"
|
||||
mod = importlib.import_module(mod)
|
||||
if is_s3 and mod.__version__.split(".") < ["0", "5"]:
|
||||
warnings.warn(s3_msg)
|
||||
for part in name.split("."):
|
||||
mod = getattr(mod, part)
|
||||
|
||||
if not isinstance(mod, type):
|
||||
raise TypeError(f"{fqp} is not a class")
|
||||
|
||||
return mod
|
||||
|
||||
|
||||
def filesystem(protocol, **storage_options):
|
||||
"""Instantiate filesystems for given protocol and arguments
|
||||
|
||||
``storage_options`` are specific to the protocol being chosen, and are
|
||||
passed directly to the class.
|
||||
"""
|
||||
if protocol == "arrow_hdfs":
|
||||
warnings.warn(
|
||||
"The 'arrow_hdfs' protocol has been deprecated and will be "
|
||||
"removed in the future. Specify it as 'hdfs'.",
|
||||
DeprecationWarning,
|
||||
)
|
||||
|
||||
cls = get_filesystem_class(protocol)
|
||||
return cls(**storage_options)
|
||||
|
||||
|
||||
def available_protocols():
|
||||
"""Return a list of the implemented protocols.
|
||||
|
||||
Note that any given protocol may require extra packages to be importable.
|
||||
"""
|
||||
return list(known_implementations)
|
||||
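To show how the registry above is extended at runtime, a small sketch of deferred registration; the protocol name and dotted class path are hypothetical stand-ins for a real third-party implementation.

from fsspec.registry import register_implementation

# Registering by string defers the import of "mypkg.fs.MyFileSystem" until the
# protocol is first used; both names here are placeholders.
register_implementation(
    "myproto",
    "mypkg.fs.MyFileSystem",
    errtxt="Install mypkg to use myproto:// URLs",
)
# fsspec.filesystem("myproto") would then import and instantiate that class.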
2242
.venv/lib/python3.10/site-packages/fsspec/spec.py
Normal file
File diff suppressed because it is too large
@@ -0,0 +1,289 @@
|
||||
import os
|
||||
from hashlib import md5
|
||||
|
||||
import pytest
|
||||
|
||||
from fsspec.implementations.local import LocalFileSystem
|
||||
from fsspec.tests.abstract.copy import AbstractCopyTests # noqa: F401
|
||||
from fsspec.tests.abstract.get import AbstractGetTests # noqa: F401
|
||||
from fsspec.tests.abstract.open import AbstractOpenTests # noqa: F401
|
||||
from fsspec.tests.abstract.pipe import AbstractPipeTests # noqa: F401
|
||||
from fsspec.tests.abstract.put import AbstractPutTests # noqa: F401
|
||||
|
||||
|
||||
class BaseAbstractFixtures:
|
||||
"""
|
||||
Abstract base class containing fixtures that are used by but never need to
|
||||
be overridden in derived filesystem-specific classes to run the abstract
|
||||
tests on such filesystems.
|
||||
"""
|
||||
|
||||
@pytest.fixture
|
||||
def fs_bulk_operations_scenario_0(self, fs, fs_join, fs_path):
|
||||
"""
|
||||
Scenario on remote filesystem that is used for many cp/get/put tests.
|
||||
|
||||
Cleans up at the end of each test in which it is used.
|
||||
"""
|
||||
source = self._bulk_operations_scenario_0(fs, fs_join, fs_path)
|
||||
yield source
|
||||
fs.rm(source, recursive=True)
|
||||
|
||||
@pytest.fixture
|
||||
def fs_glob_edge_cases_files(self, fs, fs_join, fs_path):
|
||||
"""
|
||||
Scenario on remote filesystem that is used for glob edge cases cp/get/put tests.
|
||||
|
||||
Cleans up at the end of each test in which it is used.
|
||||
"""
|
||||
source = self._glob_edge_cases_files(fs, fs_join, fs_path)
|
||||
yield source
|
||||
fs.rm(source, recursive=True)
|
||||
|
||||
@pytest.fixture
|
||||
def fs_dir_and_file_with_same_name_prefix(self, fs, fs_join, fs_path):
|
||||
"""
|
||||
Scenario on remote filesystem that is used to check cp/get/put on directory
|
||||
and file with the same name prefixes.
|
||||
|
||||
Cleans up at the end of each test in which it is used.
|
||||
"""
|
||||
source = self._dir_and_file_with_same_name_prefix(fs, fs_join, fs_path)
|
||||
yield source
|
||||
fs.rm(source, recursive=True)
|
||||
|
||||
@pytest.fixture
|
||||
def fs_10_files_with_hashed_names(self, fs, fs_join, fs_path):
|
||||
"""
|
||||
Scenario on remote filesystem that is used to check cp/get/put files order
|
||||
when source and destination are lists.
|
||||
|
||||
Cleans up at the end of each test in which it is used.
|
||||
"""
|
||||
source = self._10_files_with_hashed_names(fs, fs_join, fs_path)
|
||||
yield source
|
||||
fs.rm(source, recursive=True)
|
||||
|
||||
@pytest.fixture
|
||||
def fs_target(self, fs, fs_join, fs_path):
|
||||
"""
|
||||
Return name of remote directory that does not yet exist to copy into.
|
||||
|
||||
Cleans up at the end of each test in which it is used.
|
||||
"""
|
||||
target = fs_join(fs_path, "target")
|
||||
yield target
|
||||
if fs.exists(target):
|
||||
fs.rm(target, recursive=True)
|
||||
|
||||
@pytest.fixture
|
||||
def local_bulk_operations_scenario_0(self, local_fs, local_join, local_path):
|
||||
"""
|
||||
Scenario on local filesystem that is used for many cp/get/put tests.
|
||||
|
||||
Cleans up at the end of each test in which it is used.
|
||||
"""
|
||||
source = self._bulk_operations_scenario_0(local_fs, local_join, local_path)
|
||||
yield source
|
||||
local_fs.rm(source, recursive=True)
|
||||
|
||||
@pytest.fixture
|
||||
def local_glob_edge_cases_files(self, local_fs, local_join, local_path):
|
||||
"""
|
||||
Scenario on local filesystem that is used for glob edge cases cp/get/put tests.
|
||||
|
||||
Cleans up at the end of each test in which it is used.
|
||||
"""
|
||||
source = self._glob_edge_cases_files(local_fs, local_join, local_path)
|
||||
yield source
|
||||
local_fs.rm(source, recursive=True)
|
||||
|
||||
@pytest.fixture
|
||||
def local_dir_and_file_with_same_name_prefix(
|
||||
self, local_fs, local_join, local_path
|
||||
):
|
||||
"""
|
||||
Scenario on local filesystem that is used to check cp/get/put on directory
|
||||
and file with the same name prefixes.
|
||||
|
||||
Cleans up at the end of each test in which it is used.
|
||||
"""
|
||||
source = self._dir_and_file_with_same_name_prefix(
|
||||
local_fs, local_join, local_path
|
||||
)
|
||||
yield source
|
||||
local_fs.rm(source, recursive=True)
|
||||
|
||||
@pytest.fixture
|
||||
def local_10_files_with_hashed_names(self, local_fs, local_join, local_path):
|
||||
"""
|
||||
Scenario on local filesystem that is used to check cp/get/put files order
|
||||
when source and destination are lists.
|
||||
|
||||
Cleans up at the end of each test in which it is used.
|
||||
"""
|
||||
source = self._10_files_with_hashed_names(local_fs, local_join, local_path)
|
||||
yield source
|
||||
local_fs.rm(source, recursive=True)
|
||||
|
||||
@pytest.fixture
|
||||
def local_target(self, local_fs, local_join, local_path):
|
||||
"""
|
||||
Return name of local directory that does not yet exist to copy into.
|
||||
|
||||
Cleans up at the end of each test in which it is used.
|
||||
"""
|
||||
target = local_join(local_path, "target")
|
||||
yield target
|
||||
if local_fs.exists(target):
|
||||
local_fs.rm(target, recursive=True)
|
||||
|
||||
def _glob_edge_cases_files(self, some_fs, some_join, some_path):
|
||||
"""
|
||||
Scenario that is used for glob edge cases cp/get/put tests.
|
||||
Creates the following directory and file structure:
|
||||
|
||||
📁 source
|
||||
├── 📄 file1
|
||||
├── 📄 file2
|
||||
├── 📁 subdir0
|
||||
│ ├── 📄 subfile1
|
||||
│ ├── 📄 subfile2
|
||||
│ └── 📁 nesteddir
|
||||
│ └── 📄 nestedfile
|
||||
└── 📁 subdir1
|
||||
├── 📄 subfile1
|
||||
├── 📄 subfile2
|
||||
└── 📁 nesteddir
|
||||
└── 📄 nestedfile
|
||||
"""
|
||||
source = some_join(some_path, "source")
|
||||
some_fs.touch(some_join(source, "file1"))
|
||||
some_fs.touch(some_join(source, "file2"))
|
||||
|
||||
for subdir_idx in range(2):
|
||||
subdir = some_join(source, f"subdir{subdir_idx}")
|
||||
nesteddir = some_join(subdir, "nesteddir")
|
||||
some_fs.makedirs(nesteddir)
|
||||
some_fs.touch(some_join(subdir, "subfile1"))
|
||||
some_fs.touch(some_join(subdir, "subfile2"))
|
||||
some_fs.touch(some_join(nesteddir, "nestedfile"))
|
||||
|
||||
return source
|
||||
|
||||
def _bulk_operations_scenario_0(self, some_fs, some_join, some_path):
|
||||
"""
|
||||
Scenario that is used for many cp/get/put tests. Creates the following
|
||||
directory and file structure:
|
||||
|
||||
📁 source
|
||||
├── 📄 file1
|
||||
├── 📄 file2
|
||||
└── 📁 subdir
|
||||
├── 📄 subfile1
|
||||
├── 📄 subfile2
|
||||
└── 📁 nesteddir
|
||||
└── 📄 nestedfile
|
||||
"""
|
||||
source = some_join(some_path, "source")
|
||||
subdir = some_join(source, "subdir")
|
||||
nesteddir = some_join(subdir, "nesteddir")
|
||||
some_fs.makedirs(nesteddir)
|
||||
some_fs.touch(some_join(source, "file1"))
|
||||
some_fs.touch(some_join(source, "file2"))
|
||||
some_fs.touch(some_join(subdir, "subfile1"))
|
||||
some_fs.touch(some_join(subdir, "subfile2"))
|
||||
some_fs.touch(some_join(nesteddir, "nestedfile"))
|
||||
return source
|
||||
|
||||
def _dir_and_file_with_same_name_prefix(self, some_fs, some_join, some_path):
|
||||
"""
|
||||
Scenario that is used to check cp/get/put on directory and file with
|
||||
the same name prefixes. Creates the following directory and file structure:
|
||||
|
||||
📁 source
|
||||
├── 📄 subdir.txt
|
||||
└── 📁 subdir
|
||||
└── 📄 subfile.txt
|
||||
"""
|
||||
source = some_join(some_path, "source")
|
||||
subdir = some_join(source, "subdir")
|
||||
file = some_join(source, "subdir.txt")
|
||||
subfile = some_join(subdir, "subfile.txt")
|
||||
some_fs.makedirs(subdir)
|
||||
some_fs.touch(file)
|
||||
some_fs.touch(subfile)
|
||||
return source
|
||||
|
||||
def _10_files_with_hashed_names(self, some_fs, some_join, some_path):
|
||||
"""
|
||||
Scenario that is used to check cp/get/put files order when source and
|
||||
destination are lists. Creates the following directory and file structure:
|
||||
|
||||
📁 source
|
||||
└── 📄 {hashed([0-9])}.txt
|
||||
"""
|
||||
source = some_join(some_path, "source")
|
||||
for i in range(10):
|
||||
hashed_i = md5(str(i).encode("utf-8")).hexdigest()
|
||||
path = some_join(source, f"{hashed_i}.txt")
|
||||
some_fs.pipe(path=path, value=f"{i}".encode())
|
||||
return source
|
||||
|
||||
|
||||
class AbstractFixtures(BaseAbstractFixtures):
|
||||
"""
|
||||
Abstract base class containing fixtures that may be overridden in derived
|
||||
filesystem-specific classes to run the abstract tests on such filesystems.
|
||||
|
||||
For any particular filesystem some of these fixtures must be overridden,
|
||||
such as ``fs`` and ``fs_path``, and others may be overridden if the
|
||||
default functions here are not appropriate, such as ``fs_join``.
|
||||
"""
|
||||
|
||||
@pytest.fixture
|
||||
def fs(self):
|
||||
raise NotImplementedError("This function must be overridden in derived classes")
|
||||
|
||||
@pytest.fixture
|
||||
def fs_join(self):
|
||||
"""
|
||||
Return a function that joins its arguments together into a path.
|
||||
|
||||
Most fsspec implementations join paths in a platform-dependent way,
|
||||
but some will override this to always use a forward slash.
|
||||
"""
|
||||
return os.path.join
|
||||
|
||||
@pytest.fixture
|
||||
def fs_path(self):
|
||||
raise NotImplementedError("This function must be overridden in derived classes")
|
||||
|
||||
@pytest.fixture(scope="class")
|
||||
def local_fs(self):
|
||||
# Maybe need an option for auto_mkdir=False? This is only relevant
|
||||
# for certain implementations.
|
||||
return LocalFileSystem(auto_mkdir=True)
|
||||
|
||||
@pytest.fixture
|
||||
def local_join(self):
|
||||
"""
|
||||
Return a function that joins its arguments together into a path, on
|
||||
the local filesystem.
|
||||
"""
|
||||
return os.path.join
|
||||
|
||||
@pytest.fixture
|
||||
def local_path(self, tmpdir):
|
||||
return tmpdir
|
||||
|
||||
@pytest.fixture
|
||||
def supports_empty_directories(self):
|
||||
"""
|
||||
Return whether this implementation supports empty directories.
|
||||
"""
|
||||
return True
|
||||
|
||||
@pytest.fixture
|
||||
def fs_sanitize_path(self):
|
||||
return lambda x: x
|
||||
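To indicate how these fixtures are meant to be consumed, a sketch of a filesystem-specific test class follows; the in-memory filesystem and the test path are illustrative choices, not part of this commit.

import pytest

import fsspec
from fsspec.tests.abstract import AbstractFixtures
from fsspec.tests.abstract.copy import AbstractCopyTests


class TestMemoryCopy(AbstractFixtures, AbstractCopyTests):
    @pytest.fixture
    def fs(self):
        # Any fsspec implementation could be substituted here.
        return fsspec.filesystem("memory")

    @pytest.fixture
    def fs_path(self):
        return "/abstract-test-root"

    @pytest.fixture
    def fs_join(self):
        # Memory paths always use forward slashes.
        return lambda *parts: "/".join(parts)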
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.