Add file-based document store and vector store (#96)

* Modify docstore and vectorstore objects to be reconstructable
* Simplify the file docstore
* Use the simple file docstore and vector store in MVP
This commit is contained in:
Duc Nguyen (john)
2023-12-04 17:46:00 +07:00
committed by GitHub
parent 0ce3a8832f
commit 37c744b616
18 changed files with 324 additions and 149 deletions

View File

@@ -1,5 +1,11 @@
from .base import BaseDocumentStore
from .elasticsearch import ElasticsearchDocumentStore
from .in_memory import InMemoryDocumentStore
from .simple_file import SimpleFileDocumentStore
__all__ = ["BaseDocumentStore", "InMemoryDocumentStore", "ElasticsearchDocumentStore"]
__all__ = [
"BaseDocumentStore",
"InMemoryDocumentStore",
"ElasticsearchDocumentStore",
"SimpleFileDocumentStore",
]

View File

@@ -1,8 +1,7 @@
from abc import ABC, abstractmethod
from pathlib import Path
from typing import List, Optional, Union
from ...base import Document
from kotaemon.base import Document
class BaseDocumentStore(ABC):
@@ -46,13 +45,3 @@ class BaseDocumentStore(ABC):
def delete(self, ids: Union[List[str], str]):
    """Delete the document(s) identified by ``ids``.

    Args:
        ids: a single document id, or a list of document ids.
    """
    pass
@abstractmethod
def save(self, path: Union[str, Path]):
    """Save the documents to ``path``.

    Declared abstract, so the actual writing is left to subclasses.

    Args:
        path: destination, given either as a string or a ``pathlib.Path``.
    """
    pass
@abstractmethod
def load(self, path: Union[str, Path]):
    """Load the document store from ``path``.

    Declared abstract, so the actual reading is left to subclasses.

    Args:
        path: source location, given either as a string or a ``pathlib.Path``.
    """
    pass

View File

@@ -1,7 +1,7 @@
from pathlib import Path
from typing import List, Optional, Union
from ...base import Document
from kotaemon.base import Document
from .base import BaseDocumentStore
MAX_DOCS_TO_GET = 10**4
@@ -27,6 +27,8 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
self.elasticsearch_url = elasticsearch_url
self.index_name = index_name
self.k1 = k1
self.b = b
# Create an Elasticsearch client instance
self.client = Elasticsearch(elasticsearch_url)
@@ -160,10 +162,10 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
self.client.delete_by_query(index=self.index_name, body=query)
self.client.indices.refresh(index=self.index_name)
def save(self, path: Union[str, Path]):
    """Save document to path (intentionally does nothing).

    Saving to a path is not required for the Elasticsearch docstore, so
    ``path`` is accepted and ignored.
    """
    return None
def load(self, path: Union[str, Path]):
    """Load document store from path (intentionally does nothing).

    Loading from a path is not required for the Elasticsearch docstore, so
    ``path`` is accepted and ignored.
    """
    return None
def __persist_flow__(self):
    """Return the values persisted for this store.

    The result maps each of ``index_name``, ``elasticsearch_url``, ``k1`` and
    ``b`` to the current value of the attribute of the same name.
    """
    persisted_fields = ("index_name", "elasticsearch_url", "k1", "b")
    return {field: getattr(self, field) for field in persisted_fields}

View File

@@ -2,7 +2,8 @@ import json
from pathlib import Path
from typing import List, Optional, Union
from ...base import Document
from kotaemon.base import Document
from .base import BaseDocumentStore
@@ -74,3 +75,6 @@ class InMemoryDocumentStore(BaseDocumentStore):
with open(path) as f:
store = json.load(f)
self._store = {key: Document.from_dict(value) for key, value in store.items()}
def __persist_flow__(self):
    """Return the values persisted for this store: an empty mapping."""
    return dict()

View File

@@ -0,0 +1,44 @@
from pathlib import Path
from typing import List, Optional, Union
from kotaemon.base import Document
from .in_memory import InMemoryDocumentStore
class SimpleFileDocumentStore(InMemoryDocumentStore):
    """Improve InMemoryDocumentStore by auto saving whenever the corpus is changed

    Every call to `add` or `delete` is immediately followed by a `save` to the
    path supplied at construction time.
    """

    # `Union[str, Path]` rather than `str | Path`: the `|` form is evaluated
    # when the function is defined and raises TypeError on Python < 3.10,
    # and the rest of this module already uses `typing.Union`.
    def __init__(self, path: Union[str, Path]):
        """Create the store and load existing content if the file is present

        Args:
            path: location of the file backing this store. When it already
                exists as a regular file, its content is loaded right away.
        """
        super().__init__()
        self._path = path
        if path is not None and Path(path).is_file():
            self.load(path)

    def add(
        self,
        docs: Union[Document, List[Document]],
        ids: Optional[Union[List[str], str]] = None,
        **kwargs,
    ):
        """Add document into document store, then save the store to file

        Args:
            docs: list of documents to add
            ids: specify the ids of documents to add or
                use existing doc.doc_id
            exist_ok: raise error when duplicate doc-id
                found in the docstore (default to False)
            **kwargs: forwarded unchanged to the parent class' `add`
        """
        super().add(docs=docs, ids=ids, **kwargs)
        self.save(self._path)

    def delete(self, ids: Union[List[str], str]):
        """Delete document by id, then save the store to file

        Args:
            ids: a single document id, or a list of document ids
        """
        super().delete(ids=ids)
        self.save(self._path)

    def __persist_flow__(self):
        """Return the values persisted for this store: the serialized path"""
        # Imported locally, as in the original, so theflow is only needed
        # when this method is actually called.
        from theflow.utils.modules import serialize

        return {"path": serialize(self._path)}