Commit f95f9d2f authored by Jan Reimes
Browse files

feat(meetings): add option to include meetings without associated files

- Introduced `include_without_files` option in MeetingCrawlConfig.
- Updated crawl logic to skip meetings that have no files URL unless `include_without_files` is enabled (it is off by default).
parent da66bf5a
Loading
Loading
Loading
Loading
+3 −0
Original line number Diff line number Diff line
@@ -19,6 +19,7 @@ from tdoc_crawler.cli.args import (
    EolPasswordOption,
    EolUsernameOption,
    HttpCacheOption,
    IncludeWithoutFilesOption,
    IncrementalOption,
    LimitMeetingsOption,
    LimitMeetingsPerWgOption,
@@ -188,6 +189,7 @@ def crawl_meetings(
    limit_wgs: LimitWgsOption = None,
    checkout: CheckoutOption = False,
    incremental: IncrementalOption = True,
    include_without_files: IncludeWithoutFilesOption = False,
    clear_db: ClearDbOption = False,
    clear_tdocs: ClearTDocsOption = False,
    clear_specs: ClearSpecsOption = False,
@@ -213,6 +215,7 @@ def crawl_meetings(
        working_groups=working_groups,
        subgroups=subgroups,
        incremental=incremental,
        include_without_files=include_without_files,
        max_retries=max_retries,
        timeout=timeout,
        limits=limits,
+4 −0
Original line number Diff line number Diff line
@@ -102,6 +102,10 @@ class MeetingCrawlConfig(BaseConfigModel):
        True,
        description=("When true, skip meetings already stored in the database (fetch only new meetings)."),
    )
    include_without_files: bool = Field(
        False,
        description="Include meetings without associated files URL (e.g., future meetings)",
    )
    max_retries: int = Field(3, ge=0, description="Max retry attempts")
    timeout: int = Field(30, gt=0, description="HTTP timeout in seconds")
    limits: CrawlLimits = Field(default_factory=_new_crawl_limits, description="Crawl limit parameters")
+3 −0
Original line number Diff line number Diff line
@@ -100,6 +100,9 @@ class MeetingCrawler:
                    for meeting in parsed_meetings:
                        if config.incremental and meeting.meeting_id in existing_ids:
                            continue
                        # Filter out meetings without files_url unless explicitly included
                        if not config.include_without_files and not meeting.files_url:
                            continue
                        meetings.append(meeting)
        finally:
            session.close()