Commit 7c4a1489 authored by jr2804's avatar jr2804

feat(crawler, database, tests): enhance date parsing and add tests

* Update DATE_PATTERN to handle various Unicode dash/hyphen characters (see the sketch below).
* Modify MeetingCrawler to parse start and end dates from separate columns.
* Introduce _parse_single_date method for single date parsing.
* Adjust the TDocDatabase schema to use meeting_id as the primary key.
* Add tests for MeetingCrawler's date parsing functionality.
* Create a new workspace configuration for debugging.
parent 09866dfa
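The core of the date-parsing change: `date.fromisoformat()` accepts only ASCII hyphens, so the widened DATE_PATTERN also matches Unicode dash variants (U+2010 through U+2015) and the matched text is normalized with `str.translate` before parsing. A minimal standalone sketch of the idea (the regex mirrors the crawler diff below; the helper name is illustrative and not part of the crawler's API):

```python
from __future__ import annotations

import re
from datetime import date

# Matches ISO-like dates whose separators may be ASCII '-' or any dash in U+2010..U+2015.
DATE_PATTERN = re.compile(r"(\d{4}[\-\u2010-\u2015]\d{2}[\-\u2010-\u2015]\d{2})")

# Translation table mapping the six Unicode dash variants to ASCII hyphen-minus.
_DASH_TRANSLATION = str.maketrans("\u2010\u2011\u2012\u2013\u2014\u2015", "------")


def parse_iso_date(text: str) -> date | None:
    """Return the first date found in *text*, tolerating Unicode dashes."""
    match = DATE_PATTERN.search(text)
    if match is None:
        return None
    # date.fromisoformat() only understands ASCII hyphens, so normalize first.
    return date.fromisoformat(match.group(1).translate(_DASH_TRANSLATION))


# An en dash (U+2013) between the components still parses correctly.
assert parse_iso_date("Meeting on 2026\u201303\u201315") == date(2026, 3, 15)
```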
+2 −1
@@ -215,3 +215,4 @@ __marimo__/

# secrets
/.env
/tests/test-cache
 No newline at end of file
+39 −7
@@ -32,7 +32,7 @@ EXCLUDED_DIRS_NORMALIZED = {entry.upper() for entry in EXCLUDED_DIRS}
FTP_HOST = "ftp.3gpp.org"
MEETINGS_BASE_URL = "https://www.3gpp.org/dynareport?code=Meetings-{code}.htm"
PORTAL_BASE_URL = "https://portal.3gpp.org"
-DATE_PATTERN = re.compile(r"(\d{4}-\d{2}-\d{2})")
+DATE_PATTERN = re.compile(r"(\d{4}[\-\u2010-\u2015]\d{2}[\-\u2010-\u2015]\d{2})")

MEETING_CODE_REGISTRY: dict[WorkingGroup, list[tuple[str, str | None]]] = {
    WorkingGroup.RAN: [
@@ -445,10 +445,16 @@ class MeetingCrawler:
    ) -> MeetingMetadata:
        short_name, portal_url, meeting_id = self._extract_meeting_reference(cells[0])
        title = cells[1].get_text(strip=True) if len(cells) > 1 else None
-        date_text = cells[2].get_text(" ", strip=True) if len(cells) > 2 else ""
-        start_date, end_date = self._parse_dates(date_text)
-        location = cells[3].get_text(" ", strip=True) if len(cells) > 3 else None
-        files_url = self._extract_first_link(cells[-1])

+        start_date_text = cells[3].get_text(" ", strip=True) if len(cells) >= 4 else ""
+        end_date_text = cells[4].get_text(" ", strip=True) if len(cells) >= 5 else start_date_text
+
+        start_date = self._parse_single_date(start_date_text)
+        end_date = self._parse_single_date(end_date_text)
+
+        location = cells[2].get_text(" ", strip=True) if len(cells) > 2 else "TBC"
+        files_url = self._extract_first_link(cells[-3])

        return MeetingMetadata(
            meeting_id=meeting_id,
            working_group=working_group,
@@ -489,13 +495,39 @@ class MeetingCrawler:
        return href if href.startswith("http") else urljoin("https://www.3gpp.org", href)

    def _parse_dates(self, text: str) -> tuple[date | None, date | None]:
        """Parse start and end dates from text containing one or more dates.

        Handles various Unicode dash/hyphen characters by normalizing to ASCII.
        """
        matches = DATE_PATTERN.findall(text)
        if not matches:
            return None, None
-        start = date.fromisoformat(matches[0])
-        end = date.fromisoformat(matches[-1]) if len(matches) > 1 else start
+        # Normalize Unicode dashes to ASCII hyphen-minus
+        start_normalized = matches[0].translate(str.maketrans("\u2010\u2011\u2012\u2013\u2014\u2015", "------"))
+        start = date.fromisoformat(start_normalized)
+        if len(matches) > 1:
+            end_normalized = matches[-1].translate(str.maketrans("\u2010\u2011\u2012\u2013\u2014\u2015", "------"))
+            end = date.fromisoformat(end_normalized)
+        else:
+            end = start
        return start, end

+    def _parse_single_date(self, text: str) -> date | None:
+        """Parse a single date from text in YYYY-MM-DD format.
+
+        Handles various Unicode dash/hyphen characters (U+002D, U+2010-U+2015)
+        by normalizing them to ASCII hyphen-minus before parsing.
+        """
+        if not text:
+            return None
+        match = DATE_PATTERN.search(text)
+        if match is None:
+            return None
+        # Normalize Unicode dashes to ASCII hyphen-minus for ISO format parsing
+        date_str = match.group(1)
+        normalized = date_str.translate(str.maketrans("\u2010\u2011\u2012\u2013\u2014\u2015", "------"))
+        return date.fromisoformat(normalized)

    def _apply_limits(self, meetings: list[MeetingMetadata], limits: CrawlLimits) -> list[MeetingMetadata]:
        if not meetings:
            return []
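For orientation, the new cell indices above imply roughly the following column layout for a meetings-table row. This is an inference from this hunk and from the test fixture added later in this commit, not a documented 3GPP page format:

```python
# Assumed column layout after this change (illustrative):
#   cells[0]   -> meeting reference: short name, portal URL, meeting_id
#   cells[1]   -> meeting title
#   cells[2]   -> location (defaults to "TBC" when missing)
#   cells[3]   -> start date
#   cells[4]   -> end date (falls back to the start-date column)
#   cells[-3]  -> cell whose first link becomes files_url
```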
+1 −2
@@ -113,8 +113,7 @@ class TDocDatabase:
        conn.execute(
            """
            CREATE TABLE IF NOT EXISTS meetings (
-                id INTEGER PRIMARY KEY AUTOINCREMENT,
-                meeting_id INTEGER NOT NULL UNIQUE,
+                meeting_id INTEGER PRIMARY KEY NOT NULL UNIQUE,
                working_group TEXT NOT NULL,
                subgroup TEXT,
                short_name TEXT NOT NULL,
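With meeting_id as the table's own primary key, a re-crawl of the same meeting can address the existing row directly instead of going through the old surrogate id. A minimal sqlite3 sketch of that upsert pattern (the commit does not show TDocDatabase's insert statements, so this is only an assumption about how the key is used; the sample values come from the test fixture below):

```python
import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute(
    """
    CREATE TABLE IF NOT EXISTS meetings (
        meeting_id INTEGER PRIMARY KEY NOT NULL,
        working_group TEXT NOT NULL,
        short_name TEXT NOT NULL
    )
    """
)

# Upsert keyed on meeting_id: re-crawling the same meeting updates the existing row.
conn.execute(
    """
    INSERT INTO meetings (meeting_id, working_group, short_name)
    VALUES (?, ?, ?)
    ON CONFLICT(meeting_id) DO UPDATE SET
        working_group = excluded.working_group,
        short_name = excluded.short_name
    """,
    (12345, "SA", "SA4#134"),
)
conn.commit()
```

In SQLite an INTEGER PRIMARY KEY column is an alias for the rowid, which is why the separate AUTOINCREMENT id column is no longer needed.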
+67 −0
{
	"folders": [
		{
			"path": "."
		}
	],
	"settings": {},
	"launch": {
		"version": "0.2.0",
		"configurations": [
			{
				"name": "Debug TDoc Crawler",
				"type": "debugpy",
				"request": "launch",
				"module": "tdoc_crawler",
				"console": "integratedTerminal",
				"justMyCode": false,
				"args": []
			},
			{
				"name": "Debug: crawl-meetings (max. 5)",
				"type": "debugpy",
				"request": "launch",
				"module": "tdoc_crawler",
				"console": "integratedTerminal",
				"justMyCode": false,
				"args": [
					"crawl-meetings",
					"--cache-dir",
					"./tests/test-cache",
					"--limit-meetings",
					"5"
				]
			},
			{
				"name": "Debug: crawl-meetings (full)",
				"type": "debugpy",
				"request": "launch",
				"module": "tdoc_crawler",
				"console": "integratedTerminal",
				"justMyCode": false,
				"args": [
					"crawl-meetings",
					"--cache-dir",
					"./tests/test-cache"
				]
			},
			{
				"name": "Debug: crawl-meetings (single WG)",
				"type": "debugpy",
				"request": "launch",
				"module": "tdoc_crawler",
				"console": "integratedTerminal",
				"justMyCode": false,
				"args": [
					"crawl-meetings",
					"--cache-dir",
					"./tests/test-cache",
					"--limit-wgs",
					"1",
					"--limit-meetings",
					"10"
				]
			}
		]
	}
}
 No newline at end of file
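For reference, the "Debug: crawl-meetings (max. 5)" profile above launches the package as a module, so it corresponds to running `python -m tdoc_crawler crawl-meetings --cache-dir ./tests/test-cache --limit-meetings 5` from the workspace root (assuming the package exposes a `__main__` entry point, which the debugpy `module` setting implies).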
+57 −1
@@ -2,10 +2,13 @@

from __future__ import annotations

+from datetime import date
from pathlib import Path
from unittest.mock import MagicMock, patch

-from tdoc_crawler.crawler import EXCLUDED_DIRS, TDOC_PATTERN, TDocCrawler
+from bs4 import BeautifulSoup
+
+from tdoc_crawler.crawler import EXCLUDED_DIRS, TDOC_PATTERN, MeetingCrawler, TDocCrawler
from tdoc_crawler.database import TDocDatabase
from tdoc_crawler.models import QueryConfig, TDocCrawlConfig, WorkingGroup

@@ -177,6 +180,59 @@ class TestTDocCrawler:
            assert {record.tdoc_id for record in stored} == {"R1-2301234"}


class TestMeetingCrawler:
    """Tests for MeetingCrawler class."""

    def test_parse_single_date(self, test_db_path: Path) -> None:
        """Verify single date parsing from text."""

        with TDocDatabase(test_db_path) as database:
            crawler = MeetingCrawler(database)
            # Regular hyphen-minus (U+002D)
            assert crawler._parse_single_date("2025-10-20") == date(2025, 10, 20)
            assert crawler._parse_single_date("Meeting on 2025-10-20") == date(2025, 10, 20)
            # Non-breaking hyphen (U+2011) - intentionally testing Unicode characters
            assert crawler._parse_single_date("2027\u201109\u201120") == date(2027, 9, 20)
            # En dash (U+2013) - intentionally testing Unicode characters
            assert crawler._parse_single_date("2026\u201303\u201315") == date(2026, 3, 15)
            # Empty/invalid cases
            assert crawler._parse_single_date("") is None
            assert crawler._parse_single_date("No date here") is None

    def test_parse_meeting_row_with_separate_date_columns(self, test_db_path: Path) -> None:
        """Verify meeting row parsing with start and end dates in separate columns."""

        html = """
        <tr>
            <td><a href="https://portal.3gpp.org/Home.aspx#/meeting?MtgId=12345">SA4#134</a></td>
            <td>Meeting Title</td>
            <td>Paris, France</td>
            <td>2025-10-20</td>
            <td>2025-10-24</td>
            <td>Info</td>
            <td><a href="https://www.3gpp.org/ftp/tsg_sa/WG4_Codec/TSGS4_134/Docs/">Files</a></td>
            <td>Extra</td>
        </tr>
        """
        soup = BeautifulSoup(html, "lxml")
        row = soup.find("tr")
        assert row is not None
        cells = row.find_all("td")

        with TDocDatabase(test_db_path) as database:
            crawler = MeetingCrawler(database)
            meeting = crawler._parse_meeting_row(cells, WorkingGroup.SA, "SA4")

            assert meeting.meeting_id == 12345
            assert meeting.short_name == "SA4#134"
            assert meeting.title == "Meeting Title"
            assert meeting.start_date == date(2025, 10, 20)
            assert meeting.end_date == date(2025, 10, 24)
            assert meeting.location == "Paris, France"
            assert meeting.working_group == WorkingGroup.SA
            assert meeting.subgroup == "SA4"


if __name__ == "__main__":
    pass
# end of file
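The new `TestMeetingCrawler` cases can be run on their own with pytest's keyword filter, for example `pytest -k TestMeetingCrawler` (assuming pytest is the project's test runner; the fixture also needs `beautifulsoup4` and `lxml` installed, as the new imports and the `lxml` parser argument require).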