From 2a3a23ecd789210846fe30bafd98f02fb7f69839 Mon Sep 17 00:00:00 2001
From: "Nguyen Trung Duc (john)"
Date: Tue, 19 Sep 2023 14:49:23 +0700
Subject: [PATCH] [AUR-420] Provide document store base interface and an
 in-memory version (#21)

Document store handles storing and indexing Documents. It supports the
following interfaces:

- add: add 1 or more documents into document store
- get: get a list of documents
- get_all: get all documents in a document store
- delete: delete 1 or more documents
- save: persist a document store to disk
- load: load a document store from disk
---
 knowledgehub/docstores/__init__.py |  4 ++
 knowledgehub/docstores/base.py     | 54 ++++++++++++++++++++
 knowledgehub/docstores/simple.py   | 81 ++++++++++++++++++++++++++++++
 knowledgehub/embeddings/base.py    |  2 +-
 tests/test_docstores.py            | 74 +++++++++++++++++++++++++++
 5 files changed, 214 insertions(+), 1 deletion(-)
 create mode 100644 knowledgehub/docstores/__init__.py
 create mode 100644 knowledgehub/docstores/base.py
 create mode 100644 knowledgehub/docstores/simple.py
 create mode 100644 tests/test_docstores.py

diff --git a/knowledgehub/docstores/__init__.py b/knowledgehub/docstores/__init__.py
new file mode 100644
index 0000000..88f6829
--- /dev/null
+++ b/knowledgehub/docstores/__init__.py
@@ -0,0 +1,4 @@
+from .base import BaseDocumentStore
+from .simple import InMemoryDocumentStore
+
+__all__ = ["BaseDocumentStore", "InMemoryDocumentStore"]
diff --git a/knowledgehub/docstores/base.py b/knowledgehub/docstores/base.py
new file mode 100644
index 0000000..14eb7ea
--- /dev/null
+++ b/knowledgehub/docstores/base.py
@@ -0,0 +1,54 @@
+from abc import ABC, abstractmethod
+from pathlib import Path
+from typing import List, Optional, Union
+
+from ..documents.base import Document
+
+
+class BaseDocumentStore(ABC):
+    """A document store is in charge of storing and managing documents"""
+
+    @abstractmethod
+    def __init__(self, *args, **kwargs):
+        ...
+
+    @abstractmethod
+    def add(
+        self,
+        docs: Union[Document, List[Document]],
+        ids: Optional[Union[List[str], str]] = None,
+        exist_ok: bool = False,
+    ):
+        """Add document into document store
+
+        Args:
+            docs: Document or list of documents
+            ids: List of ids of the documents. Optional, if not set will use doc.doc_id
+            exist_ok: If True, will not raise error if document already exists
+        """
+        ...
+
+    @abstractmethod
+    def get(self, ids: Union[List[str], str]) -> List[Document]:
+        """Get document by id"""
+        ...
+
+    @abstractmethod
+    def get_all(self) -> dict:
+        """Get all documents"""
+        ...
+
+    @abstractmethod
+    def delete(self, ids: Union[List[str], str]):
+        """Delete document by id"""
+        ...
+
+    @abstractmethod
+    def save(self, path: Union[str, Path]):
+        """Save document store to path"""
+        ...
+
+    @abstractmethod
+    def load(self, path: Union[str, Path]):
+        """Load document store from path"""
+        ...
diff --git a/knowledgehub/docstores/simple.py b/knowledgehub/docstores/simple.py
new file mode 100644
index 0000000..7f812e8
--- /dev/null
+++ b/knowledgehub/docstores/simple.py
@@ -0,0 +1,81 @@
+import json
+from pathlib import Path
+from typing import List, Optional, Union
+
+from ..documents.base import Document
+from .base import BaseDocumentStore
+
+
+class InMemoryDocumentStore(BaseDocumentStore):
+    """Simple memory document store that stores documents in a dictionary"""
+
+    def __init__(self):
+        self.store = {}
+
+    def add(
+        self,
+        docs: Union[Document, List[Document]],
+        ids: Optional[Union[List[str], str]] = None,
+        exist_ok: bool = False,
+    ):
+        """Add document into document store
+
+        Args:
+            docs: Document or list of documents
+            ids: Id or list of ids of the documents. Optional, if not set will use
+                doc.doc_id
+            exist_ok: If True, will overwrite the stored document instead of raising
+                an error when the id already exists
+
+        Raises:
+            ValueError: If the number of ids differs from the number of documents,
+                or if an id already exists and exist_ok is False
+        """
+        # Wrap a single document first: the default ids below are built by
+        # iterating over docs, which must be a list by then
+        if not isinstance(docs, list):
+            docs = [docs]
+
+        doc_ids = ids if ids else [doc.doc_id for doc in docs]
+        if not isinstance(doc_ids, list):
+            doc_ids = [doc_ids]
+
+        # zip() below would silently drop the unmatched ids or documents
+        if len(doc_ids) != len(docs):
+            raise ValueError(f"Expected {len(docs)} ids but got {len(doc_ids)}")
+
+        for doc_id, doc in zip(doc_ids, docs):
+            if doc_id in self.store and not exist_ok:
+                raise ValueError(f"Document with id {doc_id} already exist")
+            self.store[doc_id] = doc
+
+    def get(self, ids: Union[List[str], str]) -> List[Document]:
+        """Get document by id, raise KeyError if an id is not in the store"""
+        if not isinstance(ids, list):
+            ids = [ids]
+
+        return [self.store[doc_id] for doc_id in ids]
+
+    def get_all(self) -> dict:
+        """Get all documents as the internal id-to-document dictionary"""
+        return self.store
+
+    def delete(self, ids: Union[List[str], str]):
+        """Delete document by id, raise KeyError if an id is not in the store"""
+        if not isinstance(ids, list):
+            ids = [ids]
+
+        for doc_id in ids:
+            del self.store[doc_id]
+
+    def save(self, path: Union[str, Path]):
+        """Save document store to path as JSON"""
+        store = {key: value.to_dict() for key, value in self.store.items()}
+        with open(path, "w", encoding="utf-8") as f:
+            json.dump(store, f)
+
+    def load(self, path: Union[str, Path]):
+        """Load document store from a JSON file, replacing the current content"""
+        with open(path, encoding="utf-8") as f:
+            store = json.load(f)
+        self.store = {key: Document.from_dict(value) for key, value in store.items()}
diff --git a/knowledgehub/embeddings/base.py b/knowledgehub/embeddings/base.py
index c3b2034..2c261ea 100644
--- a/knowledgehub/embeddings/base.py
+++ b/knowledgehub/embeddings/base.py
@@ -1,7 +1,7 @@
 from abc import abstractmethod
 from typing import List, Type
 
-from langchain.embeddings.base import Embeddings as LCEmbeddings
+from langchain.schema.embeddings import Embeddings as LCEmbeddings
 from theflow import Param
 
 from ..components import BaseComponent
diff --git a/tests/test_docstores.py b/tests/test_docstores.py
new file mode 100644
index 0000000..d218211
--- /dev/null
+++ b/tests/test_docstores.py
@@ -0,0 +1,74 @@
+import pytest
+
+from kotaemon.docstores import InMemoryDocumentStore
+from kotaemon.documents.base import Document
+
+
+def test_simple_document_store_base_interfaces(tmp_path):
+    """Test all interfaces of a document store"""
+
+    store = InMemoryDocumentStore()
+    docs = [
+        Document(text=f"Sample text {idx}", meta={"meta_key": f"meta_value_{idx}"})
+        for idx in range(10)
+    ]
+
+    # Test add and get all
+    assert len(store.get_all()) == 0, "Document store should be empty"
+    store.add(docs)
+    assert len(store.get_all()) == 10, "Document store should have 10 documents"
+
+    # Test add with provided ids
+    store.add(docs=docs, ids=[f"doc_{idx}" for idx in range(10)])
+    assert len(store.get_all()) == 20, "Document store should have 20 documents"
+
+    # Test add without exist_ok
+    with pytest.raises(ValueError):
+        store.add(docs=docs, ids=[f"doc_{idx}" for idx in range(10)])
+
+    # Update ok with add exist_ok
+    store.add(docs=docs, ids=[f"doc_{idx}" for idx in range(10)], exist_ok=True)
+    assert len(store.get_all()) == 20, "Document store should have 20 documents"
+
+    # Test get with str id
+    matched = store.get(docs[0].doc_id)
+    assert len(matched) == 1, "Should return 1 document"
+    assert matched[0].text == docs[0].text, "Should return the correct document"
+
+    # Test get with list of ids
+    matched = store.get([docs[0].doc_id, docs[1].doc_id])
+    assert len(matched) == 2, "Should return 2 documents"
+    assert [doc.text for doc in matched] == [doc.text for doc in docs[:2]]
+
+    # Test delete with str id
+    store.delete(docs[0].doc_id)
+    assert len(store.get_all()) == 19, "Document store should have 19 documents"
+
+    # Test delete with list of ids
+    store.delete([docs[1].doc_id, docs[2].doc_id])
+    assert len(store.get_all()) == 17, "Document store should have 17 documents"
+
+    # Test save
+    store.save(tmp_path / "store.json")
+    assert (tmp_path / "store.json").exists(), "File should exist"
+
+    # Test load
+    store2 = InMemoryDocumentStore()
+    store2.load(tmp_path / "store.json")
+    assert len(store2.get_all()) == 17, "Laded document store should have 17 documents"
+
+
+def test_simple_document_store_add_single_document():
+    """Test adding a single document that is not wrapped in a list"""
+
+    store = InMemoryDocumentStore()
+    doc = Document(text="Sample text", meta={"meta_key": "meta_value"})
+
+    # Test add with a bare document, the id falls back to doc.doc_id
+    store.add(doc)
+    assert len(store.get_all()) == 1, "Document store should have 1 document"
+    assert store.get(doc.doc_id)[0].text == doc.text, "Should return the document"
+
+    # Test add with mismatched number of ids and documents
+    with pytest.raises(ValueError):
+        store.add(docs=[doc], ids=["doc_0", "doc_1"])