Add new OCRReader with PDF+OCR text merging (#66)

This change speeds up OCR extraction by allowing OCR to be bypassed for text that is irrelevant (i.e., not in a table).

---------

Co-authored-by: Nguyen Trung Duc (john) <trungduc1992@gmail.com>
This commit is contained in:
Tuan Anh Nguyen Dang (Tadashi_Cin)
2023-11-13 17:43:02 +07:00
committed by GitHub
parent d79b3744cb
commit 4704e2c11a
10 changed files with 523 additions and 126 deletions

View File

@@ -70,9 +70,11 @@ class ReaderIndexingPipeline(BaseComponent):
embedding=self.embedding,
)
text_splitter: SimpleNodeParser = SimpleNodeParser.withx(
chunk_size=1024, chunk_overlap=256
)
@Node.auto(depends_on=["chunk_size", "chunk_overlap"])
def text_splitter(self) -> SimpleNodeParser:
return SimpleNodeParser(
chunk_size=self.chunk_size, chunk_overlap=self.chunk_overlap
)
def run(
self,