Reduce the default chunk size in the reasoning pipeline to fit LLM capability
This commit is contained in:
parent
107bc7580e
commit
bff55230ba
|
@ -7,10 +7,12 @@ repos:
|
|||
- id: check-toml
|
||||
- id: end-of-file-fixer
|
||||
- id: trailing-whitespace
|
||||
- id: mixed-line-ending
|
||||
- id: detect-aws-credentials
|
||||
args: ["--allow-missing-credentials"]
|
||||
- id: detect-private-key
|
||||
- id: check-added-large-files
|
||||
- id: debug-statements
|
||||
- repo: https://github.com/ambv/black
|
||||
rev: 22.3.0
|
||||
hooks:
|
||||
|
|
|
@ -166,7 +166,7 @@ class DocumentRetrievalPipeline(BaseRetriever):
|
|||
)
|
||||
table_pages[doc.metadata["file_name"]].append(doc.metadata["page_label"])
|
||||
|
||||
queries = [
|
||||
queries: list[dict] = [
|
||||
{"$and": [{"file_name": {"$eq": fn}}, {"page_label": {"$in": pls}}]}
|
||||
for fn, pls in table_pages.items()
|
||||
]
|
||||
|
@ -174,7 +174,7 @@ class DocumentRetrievalPipeline(BaseRetriever):
|
|||
extra_docs = self.vector_retrieval(
|
||||
text="",
|
||||
top_k=50,
|
||||
where={"$or": queries},
|
||||
where=queries[0] if len(queries) == 1 else {"$or": queries},
|
||||
)
|
||||
for doc in extra_docs:
|
||||
if doc.doc_id not in retrieved_id:
|
||||
|
|
|
@ -6,7 +6,15 @@ class FileUpload(BasePage):
|
|||
def __init__(self, app):
|
||||
self._app = app
|
||||
self._supported_file_types = [
|
||||
"image", ".pdf", ".txt", ".csv", ".xlsx", ".doc", ".docx", ".pptx", ".html"
|
||||
"image",
|
||||
".pdf",
|
||||
".txt",
|
||||
".csv",
|
||||
".xlsx",
|
||||
".doc",
|
||||
".docx",
|
||||
".pptx",
|
||||
".html",
|
||||
]
|
||||
self.on_building_ui()
|
||||
|
||||
|
|
|
@ -33,7 +33,7 @@ class PrepareEvidencePipeline(BaseComponent):
|
|||
"""
|
||||
|
||||
trim_func: TokenSplitter = TokenSplitter.withx(
|
||||
chunk_size=7600,
|
||||
chunk_size=3000,
|
||||
chunk_overlap=0,
|
||||
separator=" ",
|
||||
tokenizer=partial(
|
||||
|
@ -232,8 +232,12 @@ class FullQAPipeline(BaseComponent):
|
|||
self, message: str, conv_id: str, history: list, **kwargs # type: ignore
|
||||
) -> Document: # type: ignore
|
||||
docs = []
|
||||
doc_ids = []
|
||||
for retriever in self.retrievers:
|
||||
docs.extend(retriever(text=message))
|
||||
for doc in retriever(text=message):
|
||||
if doc.doc_id not in doc_ids:
|
||||
docs.append(doc)
|
||||
doc_ids.append(doc.doc_id)
|
||||
evidence_mode, evidence = self.evidence_pipeline(docs).content
|
||||
answer = await self.answering_pipeline(
|
||||
question=message, evidence=evidence, evidence_mode=evidence_mode
|
||||
|
@ -287,7 +291,7 @@ class FullQAPipeline(BaseComponent):
|
|||
|
||||
if not_detected:
|
||||
self.report_output(
|
||||
{"evidence": "Retrieved docs without matching evidence:\n"}
|
||||
{"evidence": "Retrieved segments without matching evidence:\n"}
|
||||
)
|
||||
for id in list(not_detected):
|
||||
self.report_output(
|
||||
|
|
Loading…
Reference in New Issue
Block a user