Commit 0f10709b authored by Jan Reimes
Browse files

feat(extraction): enhance hybrid server configuration and processing

* Update hybrid server config to include enrich_formula and enrich_picture options.
* Modify ensure_hybrid_server_for_profile to utilize new config.
* Adjust convert_for_wiki to conditionally set hybrid parameters based on profile.
* Change demo.bat to use default profile for workspace processing.
parent 9a530af8
Loading
Loading
Loading
Loading
+2 −1
Original line number Diff line number Diff line
@@ -39,4 +39,5 @@ tdoc-crawler query --agenda "*atias*" --start-date 2018
3gpp-crawler workspace members

:: convert tdocs/specs to PDF/artefacts for AI processing (portable fallback profile)
3gpp-crawler workspace process --profile pdf-only
:: 3gpp-crawler workspace process --profile pdf-only
3gpp-crawler workspace process --profile default
+7 −3
Original line number Diff line number Diff line
@@ -7,7 +7,7 @@ from pathlib import Path
from rich.progress import BarColumn, MofNCompleteColumn, Progress, SpinnerColumn, TaskID, TextColumn

from tdoc_crawler.database.base import DocDatabase
from tdoc_crawler.extraction.hybrid_server import ensure_hybrid_server
from tdoc_crawler.extraction.hybrid_server import HybridServerConfig, ensure_hybrid_server
from tdoc_crawler.extraction.profiles import ExtractionProfile
from tdoc_crawler.logging import get_console
from tdoc_crawler.specs.operations.checkout import clear_checkout_specs
@@ -86,11 +86,15 @@ def create_progress_bar(description: str, total: float = 100) -> tuple[Progress,

def ensure_hybrid_server_for_profile(profile: ExtractionProfile) -> bool:
    """Ensure hybrid server is available for extraction profiles that require it."""
    if profile == ExtractionProfile.PDF_ONLY:
    if profile != ExtractionProfile.ADVANCED:
        return True

    config = HybridServerConfig(
        enrich_formula=True,
        enrich_picture=True,
    )
    console.print(f"[dim]Ensuring hybrid server for profile '{profile.value}'...[/dim]")
    _, status = ensure_hybrid_server(auto_start=True, progress_callback=lambda message: console.print(f"[dim]{message}[/dim]"))
    _, status = ensure_hybrid_server(config=config, auto_start=True, progress_callback=lambda message: console.print(f"[dim]{message}[/dim]"))
    if status.running:
        return True

+1 −1
Original line number Diff line number Diff line
@@ -263,7 +263,7 @@ def convert_for_wiki(
    pdf_path = ensure_pdf(primary, wiki_source_dir, force=force)

    config = OpendataloaderConfig(
        hybrid="docling-fast",
        hybrid="docling-fast" if profile == ExtractionProfile.ADVANCED else None,
        hybrid_mode="full" if profile == ExtractionProfile.ADVANCED else None,
    )
    markdown_content, json_path, _ = _ensure_converted(
+6 −2
Original line number Diff line number Diff line
@@ -38,6 +38,8 @@ class HybridServerConfig:
    port: int = DEFAULT_PORT
    device: str = "auto"
    log_level: str = "info"
    enrich_formula: bool = False
    enrich_picture: bool = False


@dataclass
@@ -145,9 +147,11 @@ class HybridServerManager:
            "--log-level",
            self.config.log_level,
            "--force-ocr",
            "--enrich-formula",
            "--enrich-picture-description",
        ]
        if self.config.enrich_formula:
            cmd.append("--enrich-formula")
        if self.config.enrich_picture:
            cmd.append("--enrich-picture-description")

        try:
            log_path = self.log_file