Commit edb6d833 authored by Jan Reimes
Browse files

refactor(rag): update insert method to support file_paths for citation tracking

* Modify TDocRAG.insert to accept file_paths for citation tracking.
* Update DocumentProcessor to pass file_paths to insert in place of the metadata dict.
* Adjust test to validate passing of file_paths to LightRAG ainsert.
* Remove outdated test for metadata kwargs handling.
parent 2cb28016
Loading
Loading
Loading
Loading
+2 −26
Original line number Diff line number Diff line
@@ -167,33 +167,9 @@ class DocumentProcessor:
                text = "---\n" + "\n".join(meta_lines) + "\n---\n\n" + text
                logger.debug("Enriched %s with generic metadata", file_path.name)

        # Insert into LightRAG
        # Insert into LightRAG — file_paths enables citation tracking in the graph
        try:
            insert_metadata: dict[str, object] = {
                "element_type": "document",
                "table_count": extraction.table_count,
                "figure_count": extraction.figure_count,
                "equation_count": extraction.equation_count,
                "table_ids": [table.element_id for table in extraction.tables],
                "figure_ids": [figure.element_id for figure in extraction.figures],
                "equation_ids": [equation.element_id for equation in extraction.equations],
            }

            # Add 3GPP-specific metadata if RAGMetadata provided
            if isinstance(metadata, RAGMetadata):
                insert_metadata.update(
                    {
                        "source_doc": metadata.tdoc_id,
                        "doc_type": "tdoc",
                        "meeting": metadata.meeting,
                        "wg": metadata.wg,
                    }
                )
            elif isinstance(metadata, dict):
                # Store generic metadata as-is
                insert_metadata["source_metadata"] = metadata

            await self.rag.insert(text, metadata=insert_metadata)
            await self.rag.insert(text, file_paths=[str(file_path)])

            # Track document in workspace index if shared storage is enabled
            if self.rag.workspace_index is not None and isinstance(metadata, RAGMetadata):
+8 −12
Original line number Diff line number Diff line
@@ -286,25 +286,21 @@ class TDocRAG:
            self._pg0_manager.stop()
            self._pg0_manager = None

    async def insert(self, text: str, **kwargs: Any) -> None:
    async def insert(
        self,
        text: str,
        file_paths: list[str] | None = None,
    ) -> None:
        """Insert document text into the knowledge graph.

        Args:
            text: Document text to insert
            **kwargs: Additional arguments passed to rag.ainsert()
            text: Document text to insert.
            file_paths: Optional file paths for citation tracking in the graph.
        """
        if not self._rag:
            raise RuntimeError("TDocRAG not started. Call start() first.")

        try:
            await self._rag.ainsert(text, **kwargs)
        except TypeError as exc:
            message = str(exc)
            if "unexpected keyword argument" in message and kwargs:
                logger.warning("LightRAG ainsert does not accept kwargs in this version; retrying without kwargs")
                await self._rag.ainsert(text)
                return
            raise
        await self._rag.ainsert(text, file_paths=file_paths)

    async def query(
        self,
+5 −27
Original line number Diff line number Diff line
@@ -158,25 +158,25 @@ class TestCreateMetadataFromDict:


@pytest.mark.asyncio
async def test_rag_insert_passes_metadata_kwargs() -> None:
    """TDocRAG.insert should pass metadata kwargs through to LightRAG ainsert."""
async def test_rag_insert_passes_file_paths() -> None:
    """TDocRAG.insert should pass file_paths through to LightRAG ainsert."""

    class _FakeRag:
        def __init__(self) -> None:
            self.called_with: tuple[str, dict] | None = None

        async def ainsert(self, text: str, **kwargs: dict) -> None:
        async def ainsert(self, text: str, **kwargs: object) -> None:
            self.called_with = (text, kwargs)

    rag = TDocRAG(LightRAGConfig())
    fake = _FakeRag()
    rag._rag = fake  # type: ignore[assignment]

    await rag.insert("sample", metadata={"element_type": "table", "page": 2})
    await rag.insert("sample", file_paths=["/path/to/doc.docx"])

    assert fake.called_with is not None
    assert fake.called_with[0] == "sample"
    assert fake.called_with[1]["metadata"]["element_type"] == "table"
    assert fake.called_with[1]["file_paths"] == ["/path/to/doc.docx"]


@pytest.mark.asyncio
@@ -216,25 +216,3 @@ async def test_rag_query_empty_workspace_returns_empty_result() -> None:
    result = await rag.query("any", mode="naive")

    assert result == ""


@pytest.mark.asyncio
async def test_rag_insert_retries_without_kwargs_for_older_lightrag() -> None:
    """TDocRAG.insert retries without kwargs when LightRAG rejects metadata args."""

    # Stand-in for a LightRAG object whose ainsert rejects any keyword argument.
    class _LegacyRag:
        def __init__(self) -> None:
            # Counts every ainsert invocation, including the one that raises.
            self.calls = 0

        async def ainsert(self, text: str, **kwargs: dict) -> None:
            self.calls += 1
            if kwargs:
                # The message contains "unexpected keyword argument", the substring
                # the kwargs-based TDocRAG.insert looks for before retrying.
                raise TypeError("LightRAG.ainsert() got an unexpected keyword argument 'metadata'")

    rag = TDocRAG(LightRAGConfig())
    legacy = _LegacyRag()
    # Inject the fake directly so insert() does not raise "TDocRAG not started".
    rag._rag = legacy  # type: ignore[assignment]

    await rag.insert("sample", metadata={"element_type": "table"})

    # Expected: one rejected call with kwargs, then one retry without them.
    assert legacy.calls == 2