diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 0ed14b7..ac08617 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -7,10 +7,12 @@ repos: - id: check-toml - id: end-of-file-fixer - id: trailing-whitespace + - id: mixed-line-ending - id: detect-aws-credentials args: ["--allow-missing-credentials"] - id: detect-private-key - id: check-added-large-files + - id: debug-statements - repo: https://github.com/ambv/black rev: 22.3.0 hooks: diff --git a/libs/ktem/ktem/indexing/file.py b/libs/ktem/ktem/indexing/file.py index 2019660..57d6b1b 100644 --- a/libs/ktem/ktem/indexing/file.py +++ b/libs/ktem/ktem/indexing/file.py @@ -166,7 +166,7 @@ class DocumentRetrievalPipeline(BaseRetriever): ) table_pages[doc.metadata["file_name"]].append(doc.metadata["page_label"]) - queries = [ + queries: list[dict] = [ {"$and": [{"file_name": {"$eq": fn}}, {"page_label": {"$in": pls}}]} for fn, pls in table_pages.items() ] @@ -174,7 +174,7 @@ class DocumentRetrievalPipeline(BaseRetriever): extra_docs = self.vector_retrieval( text="", top_k=50, - where={"$or": queries}, + where=queries[0] if len(queries) == 1 else {"$or": queries}, ) for doc in extra_docs: if doc.doc_id not in retrieved_id: diff --git a/libs/ktem/ktem/pages/chat/upload.py b/libs/ktem/ktem/pages/chat/upload.py index f60910d..809f522 100644 --- a/libs/ktem/ktem/pages/chat/upload.py +++ b/libs/ktem/ktem/pages/chat/upload.py @@ -6,7 +6,15 @@ class FileUpload(BasePage): def __init__(self, app): self._app = app self._supported_file_types = [ - "image", ".pdf", ".txt", ".csv", ".xlsx", ".doc", ".docx", ".pptx", ".html" + "image", + ".pdf", + ".txt", + ".csv", + ".xlsx", + ".doc", + ".docx", + ".pptx", + ".html", ] self.on_building_ui() diff --git a/libs/ktem/ktem/reasoning/simple.py b/libs/ktem/ktem/reasoning/simple.py index 67ae455..386320d 100644 --- a/libs/ktem/ktem/reasoning/simple.py +++ b/libs/ktem/ktem/reasoning/simple.py @@ -33,7 +33,7 @@ class PrepareEvidencePipeline(BaseComponent): """ trim_func: TokenSplitter = TokenSplitter.withx( - chunk_size=7600, + chunk_size=3000, chunk_overlap=0, separator=" ", tokenizer=partial( @@ -232,8 +232,12 @@ class FullQAPipeline(BaseComponent): self, message: str, conv_id: str, history: list, **kwargs # type: ignore ) -> Document: # type: ignore docs = [] + doc_ids = [] for retriever in self.retrievers: - docs.extend(retriever(text=message)) + for doc in retriever(text=message): + if doc.doc_id not in doc_ids: + docs.append(doc) + doc_ids.append(doc.doc_id) evidence_mode, evidence = self.evidence_pipeline(docs).content answer = await self.answering_pipeline( question=message, evidence=evidence, evidence_mode=evidence_mode @@ -287,7 +291,7 @@ class FullQAPipeline(BaseComponent): if not_detected: self.report_output( - {"evidence": "Retrieved docs without matching evidence:\n"} + {"evidence": "Retrieved segments without matching evidence:\n"} ) for id in list(not_detected): self.report_output(