feat: added http header support for document converter and cli (#642)

* added http header support for document converter and cli

Signed-off-by: Luke Harrison <Luke.Harrison1@ibm.com>

* fixed formatting and typing issues

Signed-off-by: Luke Harrison <Luke.Harrison1@ibm.com>

* use pydantic to parse dict suggested by @dolfim-ibm

Co-authored-by: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com>
Signed-off-by: Luke Harrison <luke.harrison1@ibm.com>

---------

Signed-off-by: Luke Harrison <Luke.Harrison1@ibm.com>
Signed-off-by: Luke Harrison <luke.harrison1@ibm.com>
Co-authored-by: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com>
2025-01-07 04:15:14 -05:00 · 2025-01-07 04:15:14 -05:00 · 0ee849e8bc
commit 0ee849e8bc
parent 569038df42
3 changed files with 24 additions and 5 deletions
--- a/docling/cli/main.py
+++ b/docling/cli/main.py
@@ -164,6 +164,11 @@ def convert(
    to_formats: List[OutputFormat] = typer.Option(
        None, "--to", help="Specify output formats. Defaults to Markdown."
    ),
+    headers: str = typer.Option(
+        None,
+        "--headers",
+        help="Specify http request headers used when fetching url input sources in the form of a JSON string",
+    ),
    image_export_mode: Annotated[
        ImageRefMode,
        typer.Option(
@@ -279,12 +284,19 @@ def convert(
    if from_formats is None:
        from_formats = [e for e in InputFormat]

+    parsed_headers: Optional[Dict[str, str]] = None
+    if headers is not None:
+        headers_t = TypeAdapter(Dict[str, str])
+        parsed_headers = headers_t.validate_json(headers)
+
    with tempfile.TemporaryDirectory() as tempdir:
        input_doc_paths: List[Path] = []
        for src in input_sources:
            try:
                # check if we can fetch some remote url
-                source = resolve_source_to_path(source=src, workdir=Path(tempdir))
+                source = resolve_source_to_path(
+                    source=src, headers=parsed_headers, workdir=Path(tempdir)
+                )
                input_doc_paths.append(source)
            except FileNotFoundError:
                err_console.print(
@@ -390,7 +402,7 @@ def convert(
        start_time = time.time()

        conv_results = doc_converter.convert_all(
-            input_doc_paths, raises_on_error=abort_on_error
+            input_doc_paths, headers=parsed_headers, raises_on_error=abort_on_error
        )

        output.mkdir(parents=True, exist_ok=True)
--- a/docling/datamodel/document.py
+++ b/docling/datamodel/document.py
@@ -227,13 +227,18 @@ class _DummyBackend(AbstractDocumentBackend):
 class _DocumentConversionInput(BaseModel):

    path_or_stream_iterator: Iterable[Union[Path, str, DocumentStream]]
+    headers: Optional[Dict[str, str]] = None
    limits: Optional[DocumentLimits] = DocumentLimits()

    def docs(
        self, format_options: Dict[InputFormat, "FormatOption"]
    ) -> Iterable[InputDocument]:
        for item in self.path_or_stream_iterator:
-            obj = resolve_source_to_stream(item) if isinstance(item, str) else item
+            obj = (
+                resolve_source_to_stream(item, self.headers)
+                if isinstance(item, str)
+                else item
+            )
            format = self._guess_format(obj)
            backend: Type[AbstractDocumentBackend]
            if format not in format_options.keys():
--- a/docling/document_converter.py
+++ b/docling/document_converter.py
@@ -176,6 +176,7 @@ class DocumentConverter:
    def convert(
        self,
        source: Union[Path, str, DocumentStream],  # TODO review naming
+        headers: Optional[Dict[str, str]] = None,
        raises_on_error: bool = True,
        max_num_pages: int = sys.maxsize,
        max_file_size: int = sys.maxsize,
@@ -185,6 +186,7 @@ class DocumentConverter:
            raises_on_error=raises_on_error,
            max_num_pages=max_num_pages,
            max_file_size=max_file_size,
+            headers=headers,
        )
        return next(all_res)

@@ -192,6 +194,7 @@ class DocumentConverter:
    def convert_all(
        self,
        source: Iterable[Union[Path, str, DocumentStream]],  # TODO review naming
+        headers: Optional[Dict[str, str]] = None,
        raises_on_error: bool = True,  # True: raises on first conversion error; False: does not raise on conv error
        max_num_pages: int = sys.maxsize,
        max_file_size: int = sys.maxsize,
@@ -201,8 +204,7 @@ class DocumentConverter:
            max_file_size=max_file_size,
        )
        conv_input = _DocumentConversionInput(
-            path_or_stream_iterator=source,
-            limits=limits,
+            path_or_stream_iterator=source, limits=limits, headers=headers
        )
        conv_res_iter = self._convert(conv_input, raises_on_error=raises_on_error)