fix: improve citation logic (#578) bump:patch
This commit is contained in:
parent
3bd19f399f
commit
4fe080737a
|
@ -334,11 +334,19 @@ class AnswerWithContextPipeline(BaseComponent):
|
||||||
highlight_text = ""
|
highlight_text = ""
|
||||||
|
|
||||||
ss = sorted(ss, key=lambda x: x["start"])
|
ss = sorted(ss, key=lambda x: x["start"])
|
||||||
|
last_end = 0
|
||||||
text = cur_doc.text[: ss[0]["start"]]
|
text = cur_doc.text[: ss[0]["start"]]
|
||||||
|
|
||||||
for idx, span in enumerate(ss):
|
for idx, span in enumerate(ss):
|
||||||
to_highlight = cur_doc.text[span["start"] : span["end"]]
|
# prevent overlapping between span
|
||||||
if len(to_highlight) > len(highlight_text):
|
span_start = max(last_end, span["start"])
|
||||||
highlight_text = to_highlight
|
span_end = max(last_end, span["end"])
|
||||||
|
|
||||||
|
to_highlight = cur_doc.text[span_start:span_end]
|
||||||
|
last_end = span_end
|
||||||
|
|
||||||
|
# append to highlight on PDF viewer
|
||||||
|
highlight_text += (" " if highlight_text else "") + to_highlight
|
||||||
|
|
||||||
span_idx = span.get("idx", None)
|
span_idx = span.get("idx", None)
|
||||||
if span_idx is not None:
|
if span_idx is not None:
|
||||||
|
@ -350,6 +358,7 @@ class AnswerWithContextPipeline(BaseComponent):
|
||||||
)
|
)
|
||||||
if idx < len(ss) - 1:
|
if idx < len(ss) - 1:
|
||||||
text += cur_doc.text[span["end"] : ss[idx + 1]["start"]]
|
text += cur_doc.text[span["end"] : ss[idx + 1]["start"]]
|
||||||
|
|
||||||
text += cur_doc.text[ss[-1]["end"] :]
|
text += cur_doc.text[ss[-1]["end"] :]
|
||||||
# add to display list
|
# add to display list
|
||||||
with_citation.append(
|
with_citation.append(
|
||||||
|
|
|
@ -152,6 +152,7 @@ class AnswerWithInlineCitation(AnswerWithContextPipeline):
|
||||||
def replace_citation_with_link(self, answer: str):
|
def replace_citation_with_link(self, answer: str):
|
||||||
# Define the regex pattern to match 【number】
|
# Define the regex pattern to match 【number】
|
||||||
pattern = r"【\d+】"
|
pattern = r"【\d+】"
|
||||||
|
alternate_pattern = r"\[\d+\]"
|
||||||
|
|
||||||
# Regular expression to match merged citations
|
# Regular expression to match merged citations
|
||||||
multi_pattern = r"【([\d,\s]+)】"
|
multi_pattern = r"【([\d,\s]+)】"
|
||||||
|
@ -166,7 +167,9 @@ class AnswerWithInlineCitation(AnswerWithContextPipeline):
|
||||||
answer = re.sub(multi_pattern, split_citations, answer)
|
answer = re.sub(multi_pattern, split_citations, answer)
|
||||||
|
|
||||||
# Find all citations in the answer
|
# Find all citations in the answer
|
||||||
matches = re.finditer(pattern, answer)
|
matches = list(re.finditer(pattern, answer))
|
||||||
|
if not matches:
|
||||||
|
matches = list(re.finditer(alternate_pattern, answer))
|
||||||
|
|
||||||
matched_citations = set()
|
matched_citations = set()
|
||||||
for match in matches:
|
for match in matches:
|
||||||
|
@ -174,11 +177,12 @@ class AnswerWithInlineCitation(AnswerWithContextPipeline):
|
||||||
matched_citations.add(citation)
|
matched_citations.add(citation)
|
||||||
|
|
||||||
for citation in matched_citations:
|
for citation in matched_citations:
|
||||||
|
citation_id = citation[1:-1]
|
||||||
answer = answer.replace(
|
answer = answer.replace(
|
||||||
citation,
|
citation,
|
||||||
(
|
(
|
||||||
"<a href='#' class='citation' "
|
"<a href='#' class='citation' "
|
||||||
f"id='mark-{citation[1:-1]}'>{citation}</a>"
|
f"id='mark-{citation_id}'>【{citation_id}】</a>"
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
|
@ -5,17 +5,38 @@ def find_text(search_span, context, min_length=5):
|
||||||
sentence_list = search_span.split("\n")
|
sentence_list = search_span.split("\n")
|
||||||
context = context.replace("\n", " ")
|
context = context.replace("\n", " ")
|
||||||
|
|
||||||
matches = []
|
matches_span = []
|
||||||
# don't search for small text
|
# don't search for small text
|
||||||
if len(search_span) > min_length:
|
if len(search_span) > min_length:
|
||||||
for sentence in sentence_list:
|
for sentence in sentence_list:
|
||||||
match = SequenceMatcher(
|
match_results = SequenceMatcher(
|
||||||
None, sentence, context, autojunk=False
|
None,
|
||||||
).find_longest_match()
|
sentence,
|
||||||
if match.size > max(len(sentence) * 0.35, min_length):
|
context,
|
||||||
matches.append((match.b, match.b + match.size))
|
autojunk=False,
|
||||||
|
).get_matching_blocks()
|
||||||
|
|
||||||
return matches
|
matched_blocks = []
|
||||||
|
for _, start, length in match_results:
|
||||||
|
if length > max(len(sentence) * 0.2, min_length):
|
||||||
|
matched_blocks.append((start, start + length))
|
||||||
|
|
||||||
|
if matched_blocks:
|
||||||
|
start_index = min(start for start, _ in matched_blocks)
|
||||||
|
end_index = max(end for _, end in matched_blocks)
|
||||||
|
length = end_index - start_index
|
||||||
|
|
||||||
|
if length > max(len(sentence) * 0.35, min_length):
|
||||||
|
matches_span.append((start_index, end_index))
|
||||||
|
|
||||||
|
if matches_span:
|
||||||
|
# merge all matches into one span
|
||||||
|
final_span = min(start for start, _ in matches_span), max(
|
||||||
|
end for _, end in matches_span
|
||||||
|
)
|
||||||
|
matches_span = [final_span]
|
||||||
|
|
||||||
|
return matches_span
|
||||||
|
|
||||||
|
|
||||||
def find_start_end_phrase(
|
def find_start_end_phrase(
|
||||||
|
|
|
@ -277,7 +277,6 @@ span.icon {
|
||||||
}
|
}
|
||||||
|
|
||||||
pdfjs-viewer-element {
|
pdfjs-viewer-element {
|
||||||
height: 100vh;
|
|
||||||
height: 100dvh;
|
height: 100dvh;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -290,9 +289,8 @@ pdfjs-viewer-element {
|
||||||
left: 0;
|
left: 0;
|
||||||
top: 0;
|
top: 0;
|
||||||
width: 100%;
|
width: 100%;
|
||||||
height: 100%;
|
height: 85dvh;
|
||||||
overflow: auto;
|
overflow: hidden;
|
||||||
background-color: rgb(0, 0, 0);
|
|
||||||
background-color: rgba(0, 0, 0, 0.4);
|
background-color: rgba(0, 0, 0, 0.4);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -302,7 +300,7 @@ pdfjs-viewer-element {
|
||||||
|
|
||||||
.modal-content {
|
.modal-content {
|
||||||
background-color: #fefefe;
|
background-color: #fefefe;
|
||||||
height: 110%;
|
height: 100%;
|
||||||
display: flex;
|
display: flex;
|
||||||
flex-direction: column;
|
flex-direction: column;
|
||||||
}
|
}
|
||||||
|
@ -323,7 +321,7 @@ pdfjs-viewer-element {
|
||||||
|
|
||||||
.modal-body {
|
.modal-body {
|
||||||
flex: 1;
|
flex: 1;
|
||||||
overflow: auto;
|
overflow: hidden;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Switch checkbox styles */
|
/* Switch checkbox styles */
|
||||||
|
|
|
@ -32,7 +32,6 @@ function run() {
|
||||||
globalThis.toggleChatColumn = (() => {
|
globalThis.toggleChatColumn = (() => {
|
||||||
/* get flex-grow value of chat_column */
|
/* get flex-grow value of chat_column */
|
||||||
let flex_grow = conv_column.style.flexGrow;
|
let flex_grow = conv_column.style.flexGrow;
|
||||||
console.log("chat col", flex_grow);
|
|
||||||
if (flex_grow == '0') {
|
if (flex_grow == '0') {
|
||||||
conv_column.style.flexGrow = '1';
|
conv_column.style.flexGrow = '1';
|
||||||
conv_column.style.minWidth = default_conv_column_min_width;
|
conv_column.style.minWidth = default_conv_column_min_width;
|
||||||
|
@ -95,10 +94,24 @@ function run() {
|
||||||
event.preventDefault(); // Prevent the default link behavior
|
event.preventDefault(); // Prevent the default link behavior
|
||||||
var citationId = event.target.getAttribute('id');
|
var citationId = event.target.getAttribute('id');
|
||||||
|
|
||||||
await sleep(100); // Sleep for 500 milliseconds
|
await sleep(100); // Sleep for 100 milliseconds
|
||||||
|
|
||||||
|
// check if modal is open
|
||||||
|
var modal = document.getElementById("pdf-modal");
|
||||||
var citation = document.querySelector('mark[id="' + citationId + '"]');
|
var citation = document.querySelector('mark[id="' + citationId + '"]');
|
||||||
if (citation) {
|
|
||||||
citation.scrollIntoView({ behavior: 'smooth' });
|
if (modal.style.display == "block") {
|
||||||
|
// trigger on click event of PDF Preview link
|
||||||
|
var detail_elem = citation;
|
||||||
|
// traverse up the DOM tree to find the parent element with tag detail
|
||||||
|
while (detail_elem.tagName.toLowerCase() != "details") {
|
||||||
|
detail_elem = detail_elem.parentElement;
|
||||||
|
}
|
||||||
|
detail_elem.getElementsByClassName("pdf-link").item(0).click();
|
||||||
|
} else {
|
||||||
|
if (citation) {
|
||||||
|
citation.scrollIntoView({ behavior: 'smooth' });
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -43,16 +43,52 @@ function onBlockLoad () {
|
||||||
modal.style.position = "fixed";
|
modal.style.position = "fixed";
|
||||||
modal.style.width = "70%";
|
modal.style.width = "70%";
|
||||||
modal.style.left = "15%";
|
modal.style.left = "15%";
|
||||||
|
modal.style.height = "100dvh";
|
||||||
} else {
|
} else {
|
||||||
modal.style.position = old_position;
|
modal.style.position = old_position;
|
||||||
modal.style.width = old_width;
|
modal.style.width = old_width;
|
||||||
modal.style.left = old_left;
|
modal.style.left = old_left;
|
||||||
|
modal.style.height = "85dvh";
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
globalThis.compareText = (search_phrase, page_label) => {
|
||||||
|
var iframe = document.querySelector("#pdf-viewer").iframe;
|
||||||
|
var innerDoc = (iframe.contentDocument) ? iframe.contentDocument : iframe.contentWindow.document;
|
||||||
|
|
||||||
|
var query_selector = (
|
||||||
|
"#viewer > div[data-page-number='" +
|
||||||
|
page_label +
|
||||||
|
"'] > div.textLayer > span"
|
||||||
|
);
|
||||||
|
var page_spans = innerDoc.querySelectorAll(query_selector);
|
||||||
|
for (var i = 0; i < page_spans.length; i++) {
|
||||||
|
var span = page_spans[i];
|
||||||
|
if (
|
||||||
|
span.textContent.length > 4 &&
|
||||||
|
(
|
||||||
|
search_phrase.includes(span.textContent) ||
|
||||||
|
span.textContent.includes(search_phrase)
|
||||||
|
)
|
||||||
|
) {
|
||||||
|
span.innerHTML = "<span class='highlight selected'>" + span.textContent + "</span>";
|
||||||
|
} else {
|
||||||
|
// if span is already highlighted, remove it
|
||||||
|
if (span.querySelector(".highlight")) {
|
||||||
|
span.innerHTML = span.textContent;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Sleep function using Promise and setTimeout
|
||||||
|
function sleep(ms) {
|
||||||
|
return new Promise(resolve => setTimeout(resolve, ms));
|
||||||
|
}
|
||||||
|
|
||||||
// Function to open modal and display PDF
|
// Function to open modal and display PDF
|
||||||
globalThis.openModal = (event) => {
|
globalThis.openModal = async (event) => {
|
||||||
event.preventDefault();
|
event.preventDefault();
|
||||||
var target = event.currentTarget;
|
var target = event.currentTarget;
|
||||||
var src = target.getAttribute("data-src");
|
var src = target.getAttribute("data-src");
|
||||||
|
@ -66,8 +102,8 @@ function onBlockLoad () {
|
||||||
if (current_src != src) {
|
if (current_src != src) {
|
||||||
pdfViewer.setAttribute("src", src);
|
pdfViewer.setAttribute("src", src);
|
||||||
}
|
}
|
||||||
pdfViewer.setAttribute("phrase", phrase);
|
// pdfViewer.setAttribute("phrase", phrase);
|
||||||
pdfViewer.setAttribute("search", search);
|
// pdfViewer.setAttribute("search", search);
|
||||||
pdfViewer.setAttribute("page", page);
|
pdfViewer.setAttribute("page", page);
|
||||||
|
|
||||||
var scrollableDiv = document.getElementById("chat-info-panel");
|
var scrollableDiv = document.getElementById("chat-info-panel");
|
||||||
|
@ -80,6 +116,10 @@ function onBlockLoad () {
|
||||||
info_panel.style.display = "none";
|
info_panel.style.display = "none";
|
||||||
}
|
}
|
||||||
scrollableDiv.scrollTop = 0;
|
scrollableDiv.scrollTop = 0;
|
||||||
|
|
||||||
|
/* search for text inside PDF page */
|
||||||
|
await sleep(500);
|
||||||
|
compareText(search, page);
|
||||||
}
|
}
|
||||||
|
|
||||||
globalThis.assignPdfOnclickEvent = () => {
|
globalThis.assignPdfOnclickEvent = () => {
|
||||||
|
@ -93,7 +133,6 @@ function onBlockLoad () {
|
||||||
var created_modal = document.getElementById("pdf-viewer");
|
var created_modal = document.getElementById("pdf-viewer");
|
||||||
if (!created_modal) {
|
if (!created_modal) {
|
||||||
createModal();
|
createModal();
|
||||||
console.log("Created modal")
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue
Block a user