feat: added http header support for document converter and cli (#642)
* added http header support for document converter and cli
  Signed-off-by: Luke Harrison <Luke.Harrison1@ibm.com>
* fixed formatting and typing issues
  Signed-off-by: Luke Harrison <Luke.Harrison1@ibm.com>
* use pydantic to parse dict suggested by @dolfim-ibm
  Co-authored-by: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com>
  Signed-off-by: Luke Harrison <luke.harrison1@ibm.com>
---------
Signed-off-by: Luke Harrison <Luke.Harrison1@ibm.com>
Signed-off-by: Luke Harrison <luke.harrison1@ibm.com>
Co-authored-by: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com>
This commit is contained in:
parent
569038df42
commit
0ee849e8bc
@ -164,6 +164,11 @@ def convert(
|
|||||||
to_formats: List[OutputFormat] = typer.Option(
|
to_formats: List[OutputFormat] = typer.Option(
|
||||||
None, "--to", help="Specify output formats. Defaults to Markdown."
|
None, "--to", help="Specify output formats. Defaults to Markdown."
|
||||||
),
|
),
|
||||||
|
headers: str = typer.Option(
|
||||||
|
None,
|
||||||
|
"--headers",
|
||||||
|
help="Specify http request headers used when fetching url input sources in the form of a JSON string",
|
||||||
|
),
|
||||||
image_export_mode: Annotated[
|
image_export_mode: Annotated[
|
||||||
ImageRefMode,
|
ImageRefMode,
|
||||||
typer.Option(
|
typer.Option(
|
||||||
@ -279,12 +284,19 @@ def convert(
|
|||||||
if from_formats is None:
|
if from_formats is None:
|
||||||
from_formats = [e for e in InputFormat]
|
from_formats = [e for e in InputFormat]
|
||||||
|
|
||||||
|
parsed_headers: Optional[Dict[str, str]] = None
|
||||||
|
if headers is not None:
|
||||||
|
headers_t = TypeAdapter(Dict[str, str])
|
||||||
|
parsed_headers = headers_t.validate_json(headers)
|
||||||
|
|
||||||
with tempfile.TemporaryDirectory() as tempdir:
|
with tempfile.TemporaryDirectory() as tempdir:
|
||||||
input_doc_paths: List[Path] = []
|
input_doc_paths: List[Path] = []
|
||||||
for src in input_sources:
|
for src in input_sources:
|
||||||
try:
|
try:
|
||||||
# check if we can fetch some remote url
|
# check if we can fetch some remote url
|
||||||
source = resolve_source_to_path(source=src, workdir=Path(tempdir))
|
source = resolve_source_to_path(
|
||||||
|
source=src, headers=parsed_headers, workdir=Path(tempdir)
|
||||||
|
)
|
||||||
input_doc_paths.append(source)
|
input_doc_paths.append(source)
|
||||||
except FileNotFoundError:
|
except FileNotFoundError:
|
||||||
err_console.print(
|
err_console.print(
|
||||||
@ -390,7 +402,7 @@ def convert(
|
|||||||
start_time = time.time()
|
start_time = time.time()
|
||||||
|
|
||||||
conv_results = doc_converter.convert_all(
|
conv_results = doc_converter.convert_all(
|
||||||
input_doc_paths, raises_on_error=abort_on_error
|
input_doc_paths, headers=parsed_headers, raises_on_error=abort_on_error
|
||||||
)
|
)
|
||||||
|
|
||||||
output.mkdir(parents=True, exist_ok=True)
|
output.mkdir(parents=True, exist_ok=True)
|
||||||
|
@ -227,13 +227,18 @@ class _DummyBackend(AbstractDocumentBackend):
|
|||||||
class _DocumentConversionInput(BaseModel):
|
class _DocumentConversionInput(BaseModel):
|
||||||
|
|
||||||
path_or_stream_iterator: Iterable[Union[Path, str, DocumentStream]]
|
path_or_stream_iterator: Iterable[Union[Path, str, DocumentStream]]
|
||||||
|
headers: Optional[Dict[str, str]] = None
|
||||||
limits: Optional[DocumentLimits] = DocumentLimits()
|
limits: Optional[DocumentLimits] = DocumentLimits()
|
||||||
|
|
||||||
def docs(
|
def docs(
|
||||||
self, format_options: Dict[InputFormat, "FormatOption"]
|
self, format_options: Dict[InputFormat, "FormatOption"]
|
||||||
) -> Iterable[InputDocument]:
|
) -> Iterable[InputDocument]:
|
||||||
for item in self.path_or_stream_iterator:
|
for item in self.path_or_stream_iterator:
|
||||||
obj = resolve_source_to_stream(item) if isinstance(item, str) else item
|
obj = (
|
||||||
|
resolve_source_to_stream(item, self.headers)
|
||||||
|
if isinstance(item, str)
|
||||||
|
else item
|
||||||
|
)
|
||||||
format = self._guess_format(obj)
|
format = self._guess_format(obj)
|
||||||
backend: Type[AbstractDocumentBackend]
|
backend: Type[AbstractDocumentBackend]
|
||||||
if format not in format_options.keys():
|
if format not in format_options.keys():
|
||||||
|
@ -176,6 +176,7 @@ class DocumentConverter:
|
|||||||
def convert(
|
def convert(
|
||||||
self,
|
self,
|
||||||
source: Union[Path, str, DocumentStream], # TODO review naming
|
source: Union[Path, str, DocumentStream], # TODO review naming
|
||||||
|
headers: Optional[Dict[str, str]] = None,
|
||||||
raises_on_error: bool = True,
|
raises_on_error: bool = True,
|
||||||
max_num_pages: int = sys.maxsize,
|
max_num_pages: int = sys.maxsize,
|
||||||
max_file_size: int = sys.maxsize,
|
max_file_size: int = sys.maxsize,
|
||||||
@ -185,6 +186,7 @@ class DocumentConverter:
|
|||||||
raises_on_error=raises_on_error,
|
raises_on_error=raises_on_error,
|
||||||
max_num_pages=max_num_pages,
|
max_num_pages=max_num_pages,
|
||||||
max_file_size=max_file_size,
|
max_file_size=max_file_size,
|
||||||
|
headers=headers,
|
||||||
)
|
)
|
||||||
return next(all_res)
|
return next(all_res)
|
||||||
|
|
||||||
@ -192,6 +194,7 @@ class DocumentConverter:
|
|||||||
def convert_all(
|
def convert_all(
|
||||||
self,
|
self,
|
||||||
source: Iterable[Union[Path, str, DocumentStream]], # TODO review naming
|
source: Iterable[Union[Path, str, DocumentStream]], # TODO review naming
|
||||||
|
headers: Optional[Dict[str, str]] = None,
|
||||||
raises_on_error: bool = True, # True: raises on first conversion error; False: does not raise on conv error
|
raises_on_error: bool = True, # True: raises on first conversion error; False: does not raise on conv error
|
||||||
max_num_pages: int = sys.maxsize,
|
max_num_pages: int = sys.maxsize,
|
||||||
max_file_size: int = sys.maxsize,
|
max_file_size: int = sys.maxsize,
|
||||||
@ -201,8 +204,7 @@ class DocumentConverter:
|
|||||||
max_file_size=max_file_size,
|
max_file_size=max_file_size,
|
||||||
)
|
)
|
||||||
conv_input = _DocumentConversionInput(
|
conv_input = _DocumentConversionInput(
|
||||||
path_or_stream_iterator=source,
|
path_or_stream_iterator=source, limits=limits, headers=headers
|
||||||
limits=limits,
|
|
||||||
)
|
)
|
||||||
conv_res_iter = self._convert(conv_input, raises_on_error=raises_on_error)
|
conv_res_iter = self._convert(conv_input, raises_on_error=raises_on_error)
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user