Loading src/tdoc_crawler/meetings/operations/crawl.py +7 −6 Original line number Diff line number Diff line Loading @@ -191,16 +191,17 @@ class MeetingCrawler: """Limit number of meetings per sub-working group.""" if limit is None or limit == 0: return meetings order: dict[WorkingGroup, list[int]] = defaultdict(list) order: dict[int, list[int]] = defaultdict(list) for meeting in meetings: sequence = order[meeting.working_group] wg_id = meeting.subtb or meeting.tbid sequence = order[wg_id] if meeting.meeting_id not in sequence: sequence.append(meeting.meeting_id) allowed_ids: dict[WorkingGroup, set[int]] = {} for working_group, sequence in order.items(): allowed_ids: dict[int, set[int]] = {} for wg_id, sequence in order.items(): selected = sequence[:limit] if limit > 0 else sequence[limit:] allowed_ids[working_group] = set(selected) return [meeting for meeting in meetings if meeting.meeting_id in allowed_ids.get(meeting.working_group, {meeting.meeting_id})] allowed_ids[wg_id] = set(selected) return [meeting for meeting in meetings if meeting.meeting_id in allowed_ids.get(meeting.subtb or meeting.tbid, {meeting.meeting_id})] @staticmethod def _limit_meetings( Loading src/tdoc_crawler/tdocs/operations/crawl.py +8 −9 Original line number Diff line number Diff line Loading @@ -17,7 +17,6 @@ from tdoc_crawler.logging import get_logger from tdoc_crawler.meetings.models import MeetingMetadata, MeetingQueryConfig from tdoc_crawler.models.base import SortOrder from tdoc_crawler.models.crawl_limits import CrawlLimits from tdoc_crawler.models.working_groups import WorkingGroup from tdoc_crawler.tdocs.models import TDocCrawlConfig, TDocMetadata from tdoc_crawler.utils.normalization import normalize_tdoc_id from tdoc_crawler.workers.tdoc_worker import fetch_meeting_document_list_subinterpreter Loading Loading @@ -306,14 +305,14 @@ class TDocCrawler: return meetings max_per_subwg = abs(per_subwg_limit) per_subwg_counts: dict[WorkingGroup, int] = {} per_subwg_counts: dict[int, int] = {} filtered: list[MeetingMetadata] = [] for meeting in meetings: working_group = meeting.working_group count = per_subwg_counts.get(working_group, 0) wg_id = meeting.subtb or meeting.tbid count = per_subwg_counts.get(wg_id, 0) if count >= max_per_subwg: continue per_subwg_counts[working_group] = count + 1 per_subwg_counts[wg_id] = count + 1 filtered.append(meeting) return filtered Loading @@ -324,16 +323,16 @@ class TDocCrawler: return meetings max_groups = abs(limit_subwgs) seen_groups: set[WorkingGroup] = set() seen_groups: set[int] = set() filtered: list[MeetingMetadata] = [] for meeting in meetings: working_group = meeting.working_group if working_group in seen_groups: wg_id = meeting.subtb or meeting.tbid if wg_id in seen_groups: filtered.append(meeting) continue if len(seen_groups) >= max_groups: continue seen_groups.add(working_group) seen_groups.add(wg_id) filtered.append(meeting) return filtered Loading Loading
src/tdoc_crawler/meetings/operations/crawl.py +7 −6 Original line number Diff line number Diff line Loading @@ -191,16 +191,17 @@ class MeetingCrawler: """Limit number of meetings per sub-working group.""" if limit is None or limit == 0: return meetings order: dict[WorkingGroup, list[int]] = defaultdict(list) order: dict[int, list[int]] = defaultdict(list) for meeting in meetings: sequence = order[meeting.working_group] wg_id = meeting.subtb or meeting.tbid sequence = order[wg_id] if meeting.meeting_id not in sequence: sequence.append(meeting.meeting_id) allowed_ids: dict[WorkingGroup, set[int]] = {} for working_group, sequence in order.items(): allowed_ids: dict[int, set[int]] = {} for wg_id, sequence in order.items(): selected = sequence[:limit] if limit > 0 else sequence[limit:] allowed_ids[working_group] = set(selected) return [meeting for meeting in meetings if meeting.meeting_id in allowed_ids.get(meeting.working_group, {meeting.meeting_id})] allowed_ids[wg_id] = set(selected) return [meeting for meeting in meetings if meeting.meeting_id in allowed_ids.get(meeting.subtb or meeting.tbid, {meeting.meeting_id})] @staticmethod def _limit_meetings( Loading
src/tdoc_crawler/tdocs/operations/crawl.py +8 −9 Original line number Diff line number Diff line Loading @@ -17,7 +17,6 @@ from tdoc_crawler.logging import get_logger from tdoc_crawler.meetings.models import MeetingMetadata, MeetingQueryConfig from tdoc_crawler.models.base import SortOrder from tdoc_crawler.models.crawl_limits import CrawlLimits from tdoc_crawler.models.working_groups import WorkingGroup from tdoc_crawler.tdocs.models import TDocCrawlConfig, TDocMetadata from tdoc_crawler.utils.normalization import normalize_tdoc_id from tdoc_crawler.workers.tdoc_worker import fetch_meeting_document_list_subinterpreter Loading Loading @@ -306,14 +305,14 @@ class TDocCrawler: return meetings max_per_subwg = abs(per_subwg_limit) per_subwg_counts: dict[WorkingGroup, int] = {} per_subwg_counts: dict[int, int] = {} filtered: list[MeetingMetadata] = [] for meeting in meetings: working_group = meeting.working_group count = per_subwg_counts.get(working_group, 0) wg_id = meeting.subtb or meeting.tbid count = per_subwg_counts.get(wg_id, 0) if count >= max_per_subwg: continue per_subwg_counts[working_group] = count + 1 per_subwg_counts[wg_id] = count + 1 filtered.append(meeting) return filtered Loading @@ -324,16 +323,16 @@ class TDocCrawler: return meetings max_groups = abs(limit_subwgs) seen_groups: set[WorkingGroup] = set() seen_groups: set[int] = set() filtered: list[MeetingMetadata] = [] for meeting in meetings: working_group = meeting.working_group if working_group in seen_groups: wg_id = meeting.subtb or meeting.tbid if wg_id in seen_groups: filtered.append(meeting) continue if len(seen_groups) >= max_groups: continue seen_groups.add(working_group) seen_groups.add(wg_id) filtered.append(meeting) return filtered Loading