import os import markdown from fast_langdetect import detect from kotaemon.base import RetrievedDocument BASE_PATH = os.environ.get("GRADIO_ROOT_PATH", "") def is_close(val1, val2, tolerance=1e-9): return abs(val1 - val2) <= tolerance def replace_mardown_header(text: str) -> str: textlines = text.splitlines() newlines = [] for line in textlines: if line.startswith("#"): line = "" + line.replace("#", "") + "" if line.startswith("=="): line = "" newlines.append(line) return "\n".join(newlines) def get_header(doc: RetrievedDocument) -> str: """Get the header for the document""" header = "" if "page_label" in doc.metadata: header += f" [Page {doc.metadata['page_label']}]" header += f" {doc.metadata.get('file_name', '')}" return header.strip() class Render: """Default text rendering into HTML for the UI""" @staticmethod def collapsible(header, content, open: bool = False) -> str: """Render an HTML friendly collapsible section""" o = " open" if open else "" return ( f"
" f"{header}{content}

" ) @staticmethod def table(text: str) -> str: """Render table from markdown format into HTML""" text = replace_mardown_header(text) return markdown.markdown( text, extensions=[ "markdown.extensions.tables", "markdown.extensions.fenced_code", ], ) @staticmethod def preview( html_content: str, doc: RetrievedDocument, highlight_text: str | None = None, ) -> str: text = doc.content pdf_path = doc.metadata.get("file_path", "") if not os.path.isfile(pdf_path): print(f"pdf-path: {pdf_path} does not exist") return html_content is_pdf = doc.metadata.get("file_type", "") == "application/pdf" page_idx = int(doc.metadata.get("page_label", 1)) if not is_pdf: print("Document is not pdf") return html_content if page_idx < 0: print("Fail to extract page number") return html_content if not highlight_text: try: lang = detect(text.replace("\n", " "))["lang"] if lang not in ["ja", "cn"]: highlight_words = [ t[:-1] if t.endswith("-") else t for t in text.split("\n") ] highlight_text = highlight_words[0] phrase = "true" else: phrase = "false" highlight_text = ( text.replace("\n", "").replace('"', "").replace("'", "") ) except Exception as e: print(e) highlight_text = text else: phrase = "true" return f""" {html_content} [Preview] """ # noqa @staticmethod def highlight(text: str, elem_id: str | None = None) -> str: """Highlight text""" id_text = f" id='mark-{elem_id}'" if elem_id else "" return f"{text}" @staticmethod def image(url: str, text: str = "") -> str: """Render an image""" img = f'
' if text: caption = f"

{text}

" return f"
{img}{caption}

" return img @staticmethod def collapsible_with_header( doc: RetrievedDocument, open_collapsible: bool = False, ) -> str: header = f"{get_header(doc)}" if doc.metadata.get("type", "") == "image": doc_content = Render.image(url=doc.metadata["image_origin"], text=doc.text) else: doc_content = Render.table(doc.text) return Render.collapsible( header=Render.preview(header, doc), content=doc_content, open=open_collapsible, ) @staticmethod def collapsible_with_header_score( doc: RetrievedDocument, override_text: str | None = None, highlight_text: str | None = None, open_collapsible: bool = False, ) -> str: """Format the retrieval score and the document""" # score from doc_store (Elasticsearch) if is_close(doc.score, -1.0): vectorstore_score = "" text_search_str = " (full-text search)
" else: vectorstore_score = str(round(doc.score, 2)) text_search_str = "
" llm_reranking_score = ( round(doc.metadata["llm_trulens_score"], 2) if doc.metadata.get("llm_trulens_score") is not None else 0.0 ) reranking_score = ( round(doc.metadata["reranking_score"], 2) if doc.metadata.get("reranking_score") is not None else 0.0 ) item_type_prefix = doc.metadata.get("type", "") item_type_prefix = item_type_prefix.capitalize() if item_type_prefix: item_type_prefix += " from " if llm_reranking_score > 0: relevant_score = llm_reranking_score elif reranking_score > 0: relevant_score = reranking_score else: relevant_score = 0.0 rendered_score = Render.collapsible( header=f" Relevance score: {relevant_score:.1f}", content="  Vectorstore score:" f" {vectorstore_score}" f"{text_search_str}" "  LLM relevant score:" f" {llm_reranking_score}
" "  Reranking score:" f" {reranking_score}
", ) text = doc.text if not override_text else override_text if doc.metadata.get("type", "") == "image": rendered_doc_content = Render.image( url=doc.metadata["image_origin"], text=text, ) else: rendered_doc_content = Render.table(text) rendered_header = Render.preview( f"{item_type_prefix}{get_header(doc)}" f" [score: {llm_reranking_score}]", doc, highlight_text=highlight_text, ) return Render.collapsible( header=rendered_header, content=rendered_score + rendered_doc_content, open=open_collapsible, )