Add new OCRReader with PDF+OCR text merging (#66)

This change speeds up OCR extraction by allowing bypassing OCR for texts that are irrelevant (not in table).

---------

Co-authored-by: Nguyen Trung Duc (john) <trungduc1992@gmail.com>
This commit is contained in:
Tuan Anh Nguyen Dang (Tadashi_Cin)
2023-11-13 17:43:02 +07:00
committed by GitHub
parent d79b3744cb
commit 4704e2c11a
10 changed files with 523 additions and 126 deletions

View File

@@ -2,7 +2,7 @@ import os
from pathlib import Path
from typing import List
from theflow import Node, Param
from theflow import Node
from theflow.utils.modules import ObjectInitDeclaration as _
from kotaemon.base import BaseComponent
@@ -43,8 +43,8 @@ class QuestionAnsweringPipeline(BaseComponent):
request_timeout=60,
)
vector_store: Param[InMemoryVectorStore] = Param(_(InMemoryVectorStore))
doc_store: Param[InMemoryDocumentStore] = Param(_(InMemoryDocumentStore))
vector_store: _[InMemoryVectorStore] = _(InMemoryVectorStore)
doc_store: _[InMemoryDocumentStore] = _(InMemoryDocumentStore)
embedding: AzureOpenAIEmbeddings = AzureOpenAIEmbeddings.withx(
model="text-embedding-ada-002",