Migrate the MVP into kotaemon (#108)

- Migrate the MVP into kotaemon.
- Preliminarily include the pipeline within the chatbot interface.
- Organize MVP as an application.

Todo:

- Add an info panel to view the planning of agents -> Fix streaming agents' output.

Resolve: #60
Resolve: #61 
Resolve: #62
This commit is contained in:
Duc Nguyen (john)
2024-01-10 15:28:09 +07:00
committed by GitHub
parent 230328c62f
commit 5a9d6f75be
31 changed files with 273 additions and 92 deletions

View File

@@ -47,6 +47,7 @@ def generate_chat_completion_obj(text):
"function_call": None,
"tool_calls": None,
},
"logprobs": None,
}
],
"usage": {"completion_tokens": 9, "prompt_tokens": 10, "total_tokens": 19},

View File

@@ -30,6 +30,7 @@ _openai_chat_completion_response = [
},
"tool_calls": None,
},
"logprobs": None,
}
],
"usage": {"completion_tokens": 9, "prompt_tokens": 10, "total_tokens": 19},

View File

@@ -30,6 +30,7 @@ _openai_chat_completion_response = ChatCompletion.parse_obj(
"finish_reason": "length",
"logprobs": None,
},
"logprobs": None,
}
],
"usage": {"completion_tokens": 9, "prompt_tokens": 10, "total_tokens": 19},

View File

@@ -23,6 +23,7 @@ _openai_chat_completion_response = [
"function_call": None,
"tool_calls": None,
},
"logprobs": None,
}
],
"usage": {"completion_tokens": 9, "prompt_tokens": 10, "total_tokens": 19},

15
tests/test_ingestor.py Normal file
View File

@@ -0,0 +1,15 @@
from pathlib import Path
from kotaemon.indices.ingests import DocumentIngestor
from kotaemon.indices.splitters import TokenSplitter
def test_ingestor_include_src():
    """Check that ingesting a PDF yields a list of nodes carrying relationships.

    Runs ``DocumentIngestor`` (normal PDF mode, token-based splitting) on the
    ``resources/table.pdf`` file located next to this test module.
    """
    dirpath = Path(__file__).parent
    ingestor = DocumentIngestor(
        pdf_mode="normal",
        text_splitter=TokenSplitter(chunk_size=50, chunk_overlap=10),
    )
    nodes = ingestor(dirpath / "resources" / "table.pdf")

    # isinstance (rather than an exact type comparison) matches the style of the
    # other tests and tolerates list subclasses.
    assert isinstance(nodes, list), "Ingestor should return a list"
    # Guard the index below so an empty result fails with a readable message
    # instead of an IndexError.
    assert nodes, "Ingestor should return at least one node"
    assert nodes[0].relationships, "Nodes should keep their relationships"

View File

@@ -28,6 +28,7 @@ _openai_chat_completion_response = ChatCompletion.parse_obj(
"function_call": None,
"tool_calls": None,
},
"logprobs": None,
}
],
"usage": {"completion_tokens": 9, "prompt_tokens": 10, "total_tokens": 19},

View File

@@ -25,6 +25,7 @@ _openai_chat_completion_responses = [
"function_call": None,
"tool_calls": None,
},
"logprobs": None,
}
],
"usage": {"completion_tokens": 9, "prompt_tokens": 10, "total_tokens": 19},

55
tests/test_splitter.py Normal file
View File

@@ -0,0 +1,55 @@
from llama_index.schema import NodeRelationship
from kotaemon.base import Document
from kotaemon.indices.splitters import TokenSplitter
# Fixture documents used as splitter input by the tests in this module.
# The text is assembled through implicit string concatenation, so every
# fragment except the last must end with a space to keep words separated.
source1 = Document(
    content="The City Hall and Raffles Place MRT stations are paired cross-platform "
    "interchanges on the NorthSouth line (NSL) and EastWest line (EWL) of the "
    "Singapore Mass Rapid Transit (MRT) system. Both are situated in the Downtown "
    "Core district: City Hall station is near landmarks such as the former City Hall, "
    "St Andrew's Cathedral and the Padang, while Raffles Place station serves Merlion "
    "Park, The Fullerton Hotel and the Asian Civilisations Museum. The stations were "
    "first announced in 1982. Constructing the tunnels between the City Hall and "
    "Raffles Place stations required the draining of the Singapore River. The "
    "stations opened on 12 December 1987 as part of the MRT extension to Outram Park "
    "station. Cross-platform transfers between the NSL and EWL began on 28 October "
    "1989, ahead of the split of the MRT network into two lines. Both stations are "
    "designated Civil Defence shelters. City Hall station features a mural by Simon "
    "Wong which depicts government buildings in the area, while two murals at Raffles "
    "Place station by Lim Sew Yong and Thang Kiang How depict scenes of Singapore's "
    "history"
)
source2 = Document(
    content="The pink cockatoo (Cacatua leadbeateri) is a medium-sized cockatoo that "
    "inhabits arid and semi-arid inland areas across Australia, with the exception of "
    "the north east. The bird has a soft-textured white and salmon-pink plumage and "
    "large, bright red and yellow crest. The sexes are quite similar, although males "
    "are usually bigger while the female has a broader yellow stripe on the crest and "
    "develops a red eye when mature. The pink cockatoo is usually found in pairs or "
    "small groups, and feeds both on the ground and in trees. It is listed as an "
    "endangered species by the Australian government. Formerly known as Major "
    "Mitchell's cockatoo, after the explorer Thomas Mitchell, the species was "
    "officially renamed the pink cockatoo in 2023 by BirdLife Australia in light of "
    "Mitchell's involvement in the massacre of Aboriginal people at Mount Dispersion, "
    "as well as a general trend to make Australian species names more culturally "
    "inclusive. This pink cockatoo with a raised crest was photographed near Mount "
    "Grenfell in New South Wales."
)
def test_split_token():
    """Verify that TokenSplitter output is a list of linked Document chunks.

    The first chunk must point back to ``source1`` and the last one to
    ``source2``; the second chunk must reference its neighbours through the
    PREVIOUS and NEXT relationships.
    """
    token_splitter = TokenSplitter(chunk_size=30, chunk_overlap=10)
    pieces = token_splitter([source1, source2])

    assert isinstance(pieces, list), "Chunks should be a list"

    head = pieces[0]
    assert isinstance(head, Document), "Chunks should be a list of Documents"
    # The leading chunk originates from the first input document.
    assert head.relationships[NodeRelationship.SOURCE].node_id == source1.doc_id

    # The second chunk is linked to the chunks on either side of it.
    second = pieces[1]
    assert second.relationships[NodeRelationship.PREVIOUS].node_id == head.doc_id
    assert second.relationships[NodeRelationship.NEXT].node_id == pieces[2].doc_id

    # The trailing chunk originates from the second input document.
    tail = pieces[-1]
    assert tail.relationships[NodeRelationship.SOURCE].node_id == source2.doc_id