Add new OCRReader with PDF+OCR text merging (#66)
This change speeds up OCR extraction by allowing OCR to be bypassed for text that is irrelevant (i.e., not inside a table).

---------

Co-authored-by: Nguyen Trung Duc (john) <trungduc1992@gmail.com>
This commit is contained in:
committed by
GitHub
parent
d79b3744cb
commit
4704e2c11a
@@ -70,9 +70,11 @@ class ReaderIndexingPipeline(BaseComponent):
|
||||
embedding=self.embedding,
|
||||
)
|
||||
|
||||
text_splitter: SimpleNodeParser = SimpleNodeParser.withx(
|
||||
chunk_size=1024, chunk_overlap=256
|
||||
)
|
||||
@Node.auto(depends_on=["chunk_size", "chunk_overlap"])
|
||||
def text_splitter(self) -> SimpleNodeParser:
|
||||
return SimpleNodeParser(
|
||||
chunk_size=self.chunk_size, chunk_overlap=self.chunk_overlap
|
||||
)
|
||||
|
||||
def run(
|
||||
self,
|
||||
|
Reference in New Issue
Block a user