Restructure index to allow it to be dynamically created by end-user (#151)

1. Introduce the concept of "collection_name" to the docstore and vector store. Each collection can be viewed much like a table in a SQL database; it allows information within a data source to be organized better (a short sketch follows this list).
2. Move the `Index` and `Source` tables from the application scope into the index scope. A corresponding set of these tables is created for each new index the user creates, so the tables depend on the index rather than on the app.
3. Make each index responsible for the UI components in the app.
4. Construct the File UI page.
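
For a concrete feel of point 1, a minimal sketch (the import path and directory names are illustrative assumptions; `SimpleFileDocumentStore` and `collection_name` come from this commit):

    from kotaemon.storages import SimpleFileDocumentStore  # import path assumed

    # Two collections sharing one storage directory, much like two tables
    # living in the same SQL database. Each collection persists to its own
    # <collection_name>.json file under `path`.
    docstore_a = SimpleFileDocumentStore(path="./storage", collection_name="index_1")
    docstore_b = SimpleFileDocumentStore(path="./storage", collection_name="index_2")
    # -> ./storage/index_1.json and ./storage/index_2.json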
Duc Nguyen (john)
2024-03-07 01:50:47 +07:00
committed by GitHub
parent cc87aaa783
commit 8a90fcfc99
43 changed files with 1658 additions and 812 deletions

View File

@@ -45,3 +45,8 @@ class BaseDocumentStore(ABC):
     def delete(self, ids: Union[List[str], str]):
         """Delete document by id"""
         ...
+
+    @abstractmethod
+    def drop(self):
+        """Drop the document store"""
+        ...
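
A minimal sketch of the contract this adds for concrete stores (the dict-backed subclass here is hypothetical, and the other abstract members of BaseDocumentStore are omitted, so it is not instantiable as written):

    class ToyDocumentStore(BaseDocumentStore):
        """Hypothetical store showing only the new drop() contract."""

        def __init__(self):
            self._store: dict = {}

        def drop(self):
            """Drop the document store: discard all of its documents."""
            self._store = {}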

View File

@@ -12,7 +12,7 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
     def __init__(
         self,
-        index_name: str = "docstore",
+        collection_name: str = "docstore",
         elasticsearch_url: str = "http://localhost:9200",
         k1: float = 2.0,
         b: float = 0.75,
@@ -27,7 +27,7 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
         )
         self.elasticsearch_url = elasticsearch_url
-        self.index_name = index_name
+        self.index_name = collection_name
         self.k1 = k1
         self.b = b
@@ -55,9 +55,9 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
         }
         # Create the index with the specified settings and mappings
-        if not self.client.indices.exists(index=index_name):
+        if not self.client.indices.exists(index=self.index_name):
             self.client.indices.create(
-                index=index_name, mappings=mappings, settings=settings
+                index=self.index_name, mappings=mappings, settings=settings
             )

     def add(
@@ -164,6 +164,11 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
         self.client.delete_by_query(index=self.index_name, body=query)
         self.client.indices.refresh(index=self.index_name)

+    def drop(self):
+        """Drop the document store"""
+        self.client.indices.delete(index=self.index_name)
+        self.client.indices.refresh(index=self.index_name)
+
     def __persist_flow__(self):
         return {
             "index_name": self.index_name,

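For illustration, the renamed constructor parameter and the new drop() pair up like so (a sketch that assumes an Elasticsearch server at the default http://localhost:9200):

    store = ElasticsearchDocumentStore(collection_name="my_collection")
    # ... add and retrieve documents ...
    store.drop()  # deletes the backing "my_collection" index from Elasticsearch
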
View File

@@ -83,3 +83,7 @@ class InMemoryDocumentStore(BaseDocumentStore):
     def __persist_flow__(self):
         return {}
+
+    def drop(self):
+        """Drop the document store"""
+        self._store = {}

View File

@@ -9,11 +9,15 @@ from .in_memory import InMemoryDocumentStore
 class SimpleFileDocumentStore(InMemoryDocumentStore):
     """Improve InMemoryDocumentStore by auto saving whenever the corpus is changed"""

-    def __init__(self, path: str | Path):
+    def __init__(self, path: str | Path, collection_name: str = "default"):
         super().__init__()
         self._path = path
-        if path is not None and Path(path).is_file():
-            self.load(path)
+        self._collection_name = collection_name
+
+        Path(path).mkdir(parents=True, exist_ok=True)
+        self._save_path = Path(path) / f"{collection_name}.json"
+        if self._save_path.is_file():
+            self.load(self._save_path)

     def get(self, ids: Union[List[str], str]) -> List[Document]:
         """Get document by id"""
@@ -22,7 +26,7 @@ class SimpleFileDocumentStore(InMemoryDocumentStore):
         for doc_id in ids:
             if doc_id not in self._store:
-                self.load(self._path)
+                self.load(self._save_path)
                 break

         return [self._store[doc_id] for doc_id in ids]
@@ -43,14 +47,22 @@ class SimpleFileDocumentStore(InMemoryDocumentStore):
             found in the docstore (default to False)
         """
         super().add(docs=docs, ids=ids, **kwargs)
-        self.save(self._path)
+        self.save(self._save_path)

     def delete(self, ids: Union[List[str], str]):
         """Delete document by id"""
         super().delete(ids=ids)
-        self.save(self._path)
+        self.save(self._save_path)
+
+    def drop(self):
+        """Drop the document store"""
+        super().drop()
+        self._save_path.unlink(missing_ok=True)

     def __persist_flow__(self):
         from theflow.utils.modules import serialize

-        return {"path": serialize(self._path)}
+        return {
+            "path": serialize(self._path),
+            "collection_name": self._collection_name,
+        }
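
The user-visible effect, sketched (paths illustrative): `path` now names a directory instead of a single JSON file, and each collection gets its own file beneath it:

    store = SimpleFileDocumentStore(path="/tmp/docstore", collection_name="my_docs")
    # persists to /tmp/docstore/my_docs.json (previously `path` was the file itself)
    store.drop()  # clears the corpus and unlinks /tmp/docstore/my_docs.json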

View File

@@ -66,6 +66,11 @@ class BaseVectorStore(ABC):
         """
         ...

+    @abstractmethod
+    def drop(self):
+        """Drop the vector store"""
+        ...
+

 class LlamaIndexVectorStore(BaseVectorStore):
     _li_class: type[LIVectorStore | BasePydanticVectorStore]

View File

@@ -66,17 +66,9 @@ class ChromaVectorStore(LlamaIndexVectorStore):
         """
         self._client.client.delete(ids=ids)

-    def delete_collection(self, collection_name: Optional[str] = None):
-        """Delete entire collection under specified name from vector stores
-
-        Args:
-            collection_name: Name of the collection to delete
-        """
-        # a rather ugly chain call but it do the job of finding
-        # original chromadb client and call delete_collection() method
-        if collection_name is None:
-            collection_name = self._client.client.name
-        self._client.client._client.delete_collection(collection_name)
+    def drop(self):
+        """Delete entire collection from vector stores"""
+        self._client.client._client.delete_collection(self._client.client.name)

     def count(self) -> int:
         return self._collection.count()
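
The caller-facing change, sketched before and after (construction mirrors the tests below):

    db = ChromaVectorStore(path="./chroma_data")
    # before: db.delete_collection()   # optionally took a collection name
    # after:
    db.drop()  # always drops the store's own chromadb collection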

View File

@@ -53,6 +53,10 @@ class InMemoryVectorStore(LlamaIndexVectorStore):
         """
         self._client = self._client.from_persist_path(persist_path=load_path, fs=fs)

+    def drop(self):
+        """Clear the old data"""
+        self._data = SimpleVectorStoreData()
+
     def __persist_flow__(self):
         d = self._data.to_dict()
         d["__type__"] = f"{self._data.__module__}.{self._data.__class__.__qualname__}"

View File

@@ -20,6 +20,7 @@ class SimpleFileVectorStore(LlamaIndexVectorStore):
     def __init__(
         self,
         path: str | Path,
+        collection_name: str = "default",
         data: Optional[SimpleVectorStoreData] = None,
         fs: Optional[fsspec.AbstractFileSystem] = None,
         **kwargs: Any,
@@ -27,8 +28,9 @@ class SimpleFileVectorStore(LlamaIndexVectorStore):
         """Initialize params."""
         self._data = data or SimpleVectorStoreData()
         self._fs = fs or fsspec.filesystem("file")
+        self._collection_name = collection_name
         self._path = path
-        self._save_path = Path(path)
+        self._save_path = Path(path) / collection_name

         super().__init__(
             data=data,
@@ -56,11 +58,16 @@ class SimpleFileVectorStore(LlamaIndexVectorStore):
         self._client.persist(str(self._save_path), self._fs)
         return r

+    def drop(self):
+        self._data = SimpleVectorStoreData()
+        self._save_path.unlink(missing_ok=True)
+
     def __persist_flow__(self):
         d = self._data.to_dict()
         d["__type__"] = f"{self._data.__module__}.{self._data.__class__.__qualname__}"
         return {
             "data": d,
+            "collection_name": self._collection_name,
             "path": str(self._path),
             # "fs": self._fs,
         }
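
Sketched usage (names illustrative); note the persist file here is `Path(path) / collection_name` with no `.json` suffix, unlike the docstore above:

    db = SimpleFileVectorStore(path="/tmp/vectors", collection_name="my_vectors")
    # persists to /tmp/vectors/my_vectors
    db.drop()  # resets the in-memory data and unlinks /tmp/vectors/my_vectors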

View File

@@ -271,9 +271,7 @@ def test_inmemory_document_store_base_interfaces(tmp_path):
 def test_simplefile_document_store_base_interfaces(tmp_path):
     """Test all interfaces of a document store"""
-    path = tmp_path / "store.json"
-    store = SimpleFileDocumentStore(path=path)
+    store = SimpleFileDocumentStore(path=tmp_path)
     docs = [
         Document(text=f"Sample text {idx}", meta={"meta_key": f"meta_value_{idx}"})
         for idx in range(10)
@@ -315,13 +313,13 @@ def test_simplefile_document_store_base_interfaces(tmp_path):
     assert len(store.get_all()) == 17, "Document store should have 17 documents"

     # Test save
-    assert path.exists(), "File should exist"
+    assert (tmp_path / "default.json").exists(), "File should exist"

     # Test load
-    store2 = SimpleFileDocumentStore(path=path)
+    store2 = SimpleFileDocumentStore(path=tmp_path)
     assert len(store2.get_all()) == 17, "Loaded document store should have 17 documents"

-    os.remove(path)
+    os.remove(tmp_path / "default.json")


 @patch(
@@ -329,7 +327,7 @@ def test_simplefile_document_store_base_interfaces(tmp_path):
     side_effect=_elastic_search_responses,
 )
 def test_elastic_document_store(elastic_api):
-    store = ElasticsearchDocumentStore(index_name="test")
+    store = ElasticsearchDocumentStore(collection_name="test")
     docs = [
         Document(text=f"Sample text {idx}", meta={"meta_key": f"meta_value_{idx}"})
View File

@@ -81,7 +81,7 @@ class TestChromaVectorStore:
         ), "load function does not load data completely"

         # test delete collection function
-        db2.delete_collection()
+        db2.drop()
         # reinit the chroma with the same collection name
         db2 = ChromaVectorStore(path=str(tmp_path))
         assert (
@@ -133,10 +133,11 @@ class TestSimpleFileVectorStore:
         embeddings = [[0.1, 0.2, 0.3], [0.4, 0.5, 0.6], [0.7, 0.8, 0.9]]
         metadatas = [{"a": 1, "b": 2}, {"a": 3, "b": 4}, {"a": 5, "b": 6}]
         ids = ["1", "2", "3"]
-        db = SimpleFileVectorStore(path=tmp_path / "test_save_load_delete.json")
+        collection_name = "test_save_load_delete"
+        db = SimpleFileVectorStore(path=tmp_path, collection_name=collection_name)
         db.add(embeddings=embeddings, metadatas=metadatas, ids=ids)
         db.delete(["3"])
-        with open(tmp_path / "test_save_load_delete.json") as f:
+        with open(tmp_path / collection_name) as f:
             data = json.load(f)
         assert (
             "1" and "2" in data["text_id_to_ref_doc_id"]
@@ -144,11 +145,11 @@ class TestSimpleFileVectorStore:
         assert (
             "3" not in data["text_id_to_ref_doc_id"]
         ), "delete function does not delete data completely"
-        db2 = SimpleFileVectorStore(path=tmp_path / "test_save_load_delete.json")
+        db2 = SimpleFileVectorStore(path=tmp_path, collection_name=collection_name)
         assert db2.get("2") == [
             0.4,
             0.5,
             0.6,
         ], "load function does not load data completely"
-        os.remove(tmp_path / "test_save_load_delete.json")
+        os.remove(tmp_path / collection_name)