feat: add HTTP header support for document converter and CLI (#642)

* add HTTP header support for document converter and CLI

Signed-off-by: Luke Harrison <Luke.Harrison1@ibm.com>

* fixed formatting and typing issues

Signed-off-by: Luke Harrison <Luke.Harrison1@ibm.com>

* use pydantic to parse dict

suggested by @dolfim-ibm

Co-authored-by: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com>
Signed-off-by: Luke Harrison <luke.harrison1@ibm.com>

---------

Signed-off-by: Luke Harrison <Luke.Harrison1@ibm.com>
Signed-off-by: Luke Harrison <luke.harrison1@ibm.com>
Co-authored-by: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com>
This commit is contained in:
Luke Harrison 2025-01-07 04:15:14 -05:00 committed by GitHub
parent 569038df42
commit 0ee849e8bc
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 24 additions and 5 deletions

View File

@ -164,6 +164,11 @@ def convert(
to_formats: List[OutputFormat] = typer.Option( to_formats: List[OutputFormat] = typer.Option(
None, "--to", help="Specify output formats. Defaults to Markdown." None, "--to", help="Specify output formats. Defaults to Markdown."
), ),
headers: str = typer.Option(
None,
"--headers",
help="Specify http request headers used when fetching url input sources in the form of a JSON string",
),
image_export_mode: Annotated[ image_export_mode: Annotated[
ImageRefMode, ImageRefMode,
typer.Option( typer.Option(
@ -279,12 +284,19 @@ def convert(
if from_formats is None: if from_formats is None:
from_formats = [e for e in InputFormat] from_formats = [e for e in InputFormat]
parsed_headers: Optional[Dict[str, str]] = None
if headers is not None:
headers_t = TypeAdapter(Dict[str, str])
parsed_headers = headers_t.validate_json(headers)
with tempfile.TemporaryDirectory() as tempdir: with tempfile.TemporaryDirectory() as tempdir:
input_doc_paths: List[Path] = [] input_doc_paths: List[Path] = []
for src in input_sources: for src in input_sources:
try: try:
# check if we can fetch some remote url # check if we can fetch some remote url
source = resolve_source_to_path(source=src, workdir=Path(tempdir)) source = resolve_source_to_path(
source=src, headers=parsed_headers, workdir=Path(tempdir)
)
input_doc_paths.append(source) input_doc_paths.append(source)
except FileNotFoundError: except FileNotFoundError:
err_console.print( err_console.print(
@ -390,7 +402,7 @@ def convert(
start_time = time.time() start_time = time.time()
conv_results = doc_converter.convert_all( conv_results = doc_converter.convert_all(
input_doc_paths, raises_on_error=abort_on_error input_doc_paths, headers=parsed_headers, raises_on_error=abort_on_error
) )
output.mkdir(parents=True, exist_ok=True) output.mkdir(parents=True, exist_ok=True)

View File

@ -227,13 +227,18 @@ class _DummyBackend(AbstractDocumentBackend):
class _DocumentConversionInput(BaseModel): class _DocumentConversionInput(BaseModel):
path_or_stream_iterator: Iterable[Union[Path, str, DocumentStream]] path_or_stream_iterator: Iterable[Union[Path, str, DocumentStream]]
headers: Optional[Dict[str, str]] = None
limits: Optional[DocumentLimits] = DocumentLimits() limits: Optional[DocumentLimits] = DocumentLimits()
def docs( def docs(
self, format_options: Dict[InputFormat, "FormatOption"] self, format_options: Dict[InputFormat, "FormatOption"]
) -> Iterable[InputDocument]: ) -> Iterable[InputDocument]:
for item in self.path_or_stream_iterator: for item in self.path_or_stream_iterator:
obj = resolve_source_to_stream(item) if isinstance(item, str) else item obj = (
resolve_source_to_stream(item, self.headers)
if isinstance(item, str)
else item
)
format = self._guess_format(obj) format = self._guess_format(obj)
backend: Type[AbstractDocumentBackend] backend: Type[AbstractDocumentBackend]
if format not in format_options.keys(): if format not in format_options.keys():

View File

@ -176,6 +176,7 @@ class DocumentConverter:
def convert( def convert(
self, self,
source: Union[Path, str, DocumentStream], # TODO review naming source: Union[Path, str, DocumentStream], # TODO review naming
headers: Optional[Dict[str, str]] = None,
raises_on_error: bool = True, raises_on_error: bool = True,
max_num_pages: int = sys.maxsize, max_num_pages: int = sys.maxsize,
max_file_size: int = sys.maxsize, max_file_size: int = sys.maxsize,
@ -185,6 +186,7 @@ class DocumentConverter:
raises_on_error=raises_on_error, raises_on_error=raises_on_error,
max_num_pages=max_num_pages, max_num_pages=max_num_pages,
max_file_size=max_file_size, max_file_size=max_file_size,
headers=headers,
) )
return next(all_res) return next(all_res)
@ -192,6 +194,7 @@ class DocumentConverter:
def convert_all( def convert_all(
self, self,
source: Iterable[Union[Path, str, DocumentStream]], # TODO review naming source: Iterable[Union[Path, str, DocumentStream]], # TODO review naming
headers: Optional[Dict[str, str]] = None,
raises_on_error: bool = True, # True: raises on first conversion error; False: does not raise on conv error raises_on_error: bool = True, # True: raises on first conversion error; False: does not raise on conv error
max_num_pages: int = sys.maxsize, max_num_pages: int = sys.maxsize,
max_file_size: int = sys.maxsize, max_file_size: int = sys.maxsize,
@ -201,8 +204,7 @@ class DocumentConverter:
max_file_size=max_file_size, max_file_size=max_file_size,
) )
conv_input = _DocumentConversionInput( conv_input = _DocumentConversionInput(
path_or_stream_iterator=source, path_or_stream_iterator=source, limits=limits, headers=headers
limits=limits,
) )
conv_res_iter = self._convert(conv_input, raises_on_error=raises_on_error) conv_res_iter = self._convert(conv_input, raises_on_error=raises_on_error)