feat: added HTTP header support for the document converter and CLI (#642)

* added HTTP header support for the document converter and CLI

Signed-off-by: Luke Harrison <Luke.Harrison1@ibm.com>

* fixed formatting and typing issues

Signed-off-by: Luke Harrison <Luke.Harrison1@ibm.com>

* use Pydantic to parse the headers JSON string into a dict

suggested by @dolfim-ibm

Co-authored-by: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com>
Signed-off-by: Luke Harrison <luke.harrison1@ibm.com>

---------

Signed-off-by: Luke Harrison <Luke.Harrison1@ibm.com>
Signed-off-by: Luke Harrison <luke.harrison1@ibm.com>
Co-authored-by: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com>
This commit is contained in:
Luke Harrison
2025-01-07 04:15:14 -05:00
committed by GitHub
parent 569038df42
commit 0ee849e8bc
3 changed files with 24 additions and 5 deletions

View File

@@ -164,6 +164,11 @@ def convert(
to_formats: List[OutputFormat] = typer.Option(
None, "--to", help="Specify output formats. Defaults to Markdown."
),
headers: str = typer.Option(
None,
"--headers",
help="Specify http request headers used when fetching url input sources in the form of a JSON string",
),
image_export_mode: Annotated[
ImageRefMode,
typer.Option(
@@ -279,12 +284,19 @@ def convert(
if from_formats is None:
from_formats = [e for e in InputFormat]
parsed_headers: Optional[Dict[str, str]] = None
if headers is not None:
headers_t = TypeAdapter(Dict[str, str])
parsed_headers = headers_t.validate_json(headers)
with tempfile.TemporaryDirectory() as tempdir:
input_doc_paths: List[Path] = []
for src in input_sources:
try:
# check if we can fetch some remote url
source = resolve_source_to_path(source=src, workdir=Path(tempdir))
source = resolve_source_to_path(
source=src, headers=parsed_headers, workdir=Path(tempdir)
)
input_doc_paths.append(source)
except FileNotFoundError:
err_console.print(
@@ -390,7 +402,7 @@ def convert(
start_time = time.time()
conv_results = doc_converter.convert_all(
input_doc_paths, raises_on_error=abort_on_error
input_doc_paths, headers=parsed_headers, raises_on_error=abort_on_error
)
output.mkdir(parents=True, exist_ok=True)