feat: validate hypav2 data

# Changelist:
## 1. Types
### MainChunks
Added id (int) and chatMemos (Set<string>)
id: incremental int starting from 0
chatMemos: a set of UUIDs recording which chats have been summarized into it.
### Chunks
mainChunkID: a reference to the mainChunk it was split from
text: the split text data

## 2. Features
### CleanInvalidChunks
Called every time the chat is updated and HypaMemory is used.

Gets all the memos (UUIDs) of the current chats and creates a set.
Then checks whether each mainChunk's chatMemos set is a subset of the entire memo set. If not, a chat that was summarized into it has been deleted or edited, so the mainChunk is filtered out.
Additionally, the chunks that were split from that mainChunk are also deleted.
This commit is contained in:
LightningHyperBlaze45654
2024-12-03 23:28:18 -08:00
parent 46502e762b
commit b8bb2330cc
2 changed files with 69 additions and 86 deletions

View File

@@ -1,7 +1,6 @@
import localforage from "localforage"; import localforage from "localforage";
import {globalFetch} from "src/ts/globalApi.svelte"; import {globalFetch} from "src/ts/globalApi.svelte";
import {runEmbedding} from "../transformers"; import {runEmbedding} from "../transformers";
import { alertError } from "src/ts/alert";
import {appendLastPath} from "src/ts/util"; import {appendLastPath} from "src/ts/util";
@@ -139,8 +138,7 @@ export class HypaProcesser{
} }
async similaritySearchScored(query: string) { async similaritySearchScored(query: string) {
const results = await this.similaritySearchVectorWithScore((await this.getEmbeds(query))[0],); return await this.similaritySearchVectorWithScore((await this.getEmbeds(query))[0],)
return results
} }
private async similaritySearchVectorWithScore( private async similaritySearchVectorWithScore(

View File

@@ -13,15 +13,15 @@ import { runSummarizer } from "../transformers";
import { parseChatML } from "src/ts/parser.svelte"; import { parseChatML } from "src/ts/parser.svelte";
export interface HypaV2Data { export interface HypaV2Data {
chunks: { lastMainChunkId: number; // can be removed, but exists to more readability of the code.
mainChunks: { // summary itself
id: number;
text: string; text: string;
targetId: string; chatMemos: Set<string>; // UUIDs of summarized chats
chatRange: [number, number]; // Start and end indices of chats summarized
}[]; }[];
mainChunks: { chunks: { // split mainChunks for retrieval or something. Although quite uncomfortable logic, so maybe I will delete it soon or later.
mainChunkID: number;
text:string; text:string;
targetId: string;
chatRange: [number, number]; // Start and end indices of chats summarized
}[]; }[];
} }
@@ -138,60 +138,41 @@ async function summary(
result = da.result; result = da.result;
} }
return { success: true, data: result }; return { success: true, data: result };
} // No, I am not going to touch any http API calls.
function isSubset<T>(subset: Set<T>, superset: Set<T>): boolean { // simple helper function. Check if subset IS a subset of superset given.
for (const item of subset) {
if (!superset.has(item)) {
return false;
}
}
return true;
} }
function cleanInvalidChunks( function cleanInvalidChunks(
chats: OpenAIChat[], chats: OpenAIChat[],
data: HypaV2Data, data: HypaV2Data,
editedChatIndex?: number
): void { ): void {
// If editedChatIndex is provided, remove chunks and mainChunks that summarize chats from that index onwards const currentChatMemos = new Set(chats.map((chat) => chat.memo)); // if chunk's memo set is not subset of this, the chunk's content -> delete
if (editedChatIndex !== undefined) {
data.mainChunks = data.mainChunks.filter(
(chunk) => chunk.chatRange[1] < editedChatIndex
);
data.chunks = data.chunks.filter(
(chunk) => chunk.chatRange[1] < editedChatIndex
);
} else {
// Confirmed that chat.memo is indeed unique uuid
const currentChatIds = new Set(chats.map((chat) => chat.memo));
// 존재하지 않는 챗의 요약본 삭제 // mainChunks filtering
data.mainChunks = data.mainChunks.filter((chunk) => { data.mainChunks = data.mainChunks.filter((mainChunk) => {
const [startIdx, endIdx] = chunk.chatRange; return isSubset(mainChunk.chatMemos, currentChatMemos);
// Check if all chats in the range exist
for (let i = startIdx; i <= endIdx; i++) {
if (!currentChatIds.has(chats[i]?.memo)) {
console.log(`Removing this mainChunk(summary) due to chat context change: ${chunk}`);
return false; // false로 filtering
}
}
return true;
}); });
// chunk filtering based on mainChunk's id
const validMainChunkIds = new Set(data.mainChunks.map((mainChunk) => mainChunk.id));
data.chunks = data.chunks.filter((chunk) =>
validMainChunkIds.has(chunk.mainChunkID)
);
data.lastMainChunkId = data.mainChunks[-1].id; // Quite literally the definition of lastMainChunkId. Didn't use .length, since middle chat context can be partially deleted.
// 같은거, 근데 이건 쪼개진 chunk들에 대하여 수행
data.chunks = data.chunks.filter((chunk) => {
const [startIdx, endIdx] = chunk.chatRange;
// 생성된 chunks는 더이상 mainChunks와 연결되지 않음. 따라서 같은 작업을 진행해야 한다.
for (let i = startIdx; i <= endIdx; i++) {
if (!currentChatIds.has(chats[i]?.memo)) {
console.log(`Removing this chunk(split) due to chat context change: ${chunk}`);
return false;
}
}
return true;
});
}
} }
export async function regenerateSummary( export async function regenerateSummary(
chats: OpenAIChat[], chats: OpenAIChat[],
data: HypaV2Data, data: HypaV2Data,
mainChunkIndex: number mainChunkIndex: number
) : Promise<void> { ) : Promise<void> {
// Should re-summarize a certain main chunk, based on index. It will then replace the original one. How much chat needs to be summarized is already defined in the mainChunk's chatRange field. const targetMainChunk = data.mainChunks[mainChunkIndex];
// After the update on mainChunks, it should also update chunks that have the same ChatRange, as they should be updated with the newly generated summary. Follow the same principles of splitting them.
} }
export async function hypaMemoryV2( export async function hypaMemoryV2(
@@ -200,8 +181,7 @@ export async function hypaMemoryV2(
maxContextTokens: number, maxContextTokens: number,
room: Chat, room: Chat,
char: character | groupChat, char: character | groupChat,
tokenizer: ChatTokenizer, tokenizer: ChatTokenizer
editedChatIndex?: number
): Promise<{ ): Promise<{
currentTokens: number; currentTokens: number;
chats: OpenAIChat[]; chats: OpenAIChat[];
@@ -209,51 +189,48 @@ export async function hypaMemoryV2(
memory?: HypaV2Data; memory?: HypaV2Data;
}> { }> {
const db = getDatabase(); const db = getDatabase();
const data: HypaV2Data = room.hypaV2Data ?? { chunks: [], mainChunks: [] }; const data: HypaV2Data = room.hypaV2Data ?? {
lastMainChunkId: 0,
chunks: [],
mainChunks: []
};
// Clean invalid chunks based on the edited chat index // Clean invalid HypaV2 data
cleanInvalidChunks(chats, data, editedChatIndex); cleanInvalidChunks(chats, data);
let allocatedTokens = db.hypaAllocatedTokens; let allocatedTokens = db.hypaAllocatedTokens;
let chunkSize = db.hypaChunkSize; let chunkSize = db.hypaChunkSize;
currentTokens += allocatedTokens + 50; currentTokens += allocatedTokens + chats.length * 4; // ChatML token counting from official openai documentation
let mainPrompt = ""; let mainPrompt = "";
const lastTwoChats = chats.slice(-2); const lastTwoChats = chats.slice(-2);
// Error handling for infinite summarization attempts
let summarizationFailures = 0; let summarizationFailures = 0;
const maxSummarizationFailures = 3; const maxSummarizationFailures = 3;
const summarizedIndices = new Set<number>(); const summarizedMemos = new Set<string>();
// Token management loop // Token management loop
while (currentTokens >= maxContextTokens) { while (currentTokens >= maxContextTokens) {
let idx = 0; let idx = 0;
let targetId = "";
const halfData: OpenAIChat[] = []; const halfData: OpenAIChat[] = [];
let halfDataTokens = 0; let halfDataTokens = 0;
let startIdx = -1;
// Find the next batch of chats to summarize // Accumulate chats to summarize
while ( while (
halfDataTokens < chunkSize && halfDataTokens < chunkSize &&
idx < chats.length - 2 // Ensure latest two chats are not added to summarization. idx < chats.length - 2 // Ensure latest two chats are not added to summarization.
) { ) {
if (!summarizedIndices.has(idx)) {
const chat = chats[idx]; const chat = chats[idx];
if (startIdx === -1) startIdx = idx; if (!summarizedMemos.has(chat.memo)) {
halfDataTokens += await tokenizer.tokenizeChat(chat); halfDataTokens += await tokenizer.tokenizeChat(chat);
halfData.push(chat); halfData.push(chat);
targetId = chat.memo;
} }
idx++; idx++;
} }
// End index gone due to using UUID sets
// Last two chats must not be summarized, else request will be broken
const endIdx = idx - 1; // End index of the chats being summarized
// Avoid summarizing the last two chats
if (halfData.length < 3) break; if (halfData.length < 3) break;
const stringlizedChat = halfData const stringlizedChat = halfData // please change this name to something else
.map((e) => `${e.role}: ${e.content}`) .map((e) => `${e.role}: ${e.content}`)
.join("\n"); .join("\n");
const summaryData = await summary(stringlizedChat); const summaryData = await summary(stringlizedChat);
@@ -281,10 +258,15 @@ export async function hypaMemoryV2(
currentTokens -= halfDataTokens; currentTokens -= halfDataTokens;
allocatedTokens -= summaryDataToken; allocatedTokens -= summaryDataToken;
data.mainChunks.unshift({ // lastMainChunkId updating(increment)
data.lastMainChunkId++;
const newMainChunkId = data.lastMainChunkId;
const chatMemos = new Set(halfData.map((chat) => chat.memo));
data.mainChunks.push({
id: newMainChunkId,
text: summaryData.data, text: summaryData.data,
targetId: targetId, chatMemos: chatMemos,
chatRange: [startIdx, endIdx],
}); });
// Split the summary into chunks based on double line breaks // Split the summary into chunks based on double line breaks
@@ -296,15 +278,14 @@ export async function hypaMemoryV2(
// Update chunks with the new summary // Update chunks with the new summary
data.chunks.push( data.chunks.push(
...splitted.map((e) => ({ ...splitted.map((e) => ({
mainChunkID: newMainChunkId,
text: e, text: e,
targetId: targetId,
chatRange: [startIdx, endIdx] as [number, number],
})) }))
); );
// Mark the chats as summarized // Mark the chats as summarized
for (let i = startIdx; i <= endIdx; i++) { for (const memo of chatMemos) {
summarizedIndices.add(i); summarizedMemos.add(memo);
} }
} }
@@ -325,15 +306,18 @@ export async function hypaMemoryV2(
const processor = new HypaProcesser(db.hypaModel); const processor = new HypaProcesser(db.hypaModel);
processor.oaikey = db.supaMemoryKey; processor.oaikey = db.supaMemoryKey;
const searchDocumentPrefix = "search_document: ";
const prefixLength = searchDocumentPrefix.length;
// Add chunks to processor for similarity search // Add chunks to processor for similarity search
await processor.addText( await processor.addText(
data.chunks data.chunks
.filter((v) => v.text.trim().length > 0) .filter((v) => v.text.trim().length > 0)
.map((v) => "search_document: " + v.text.trim()) .map((v) => searchDocumentPrefix + v.text.trim()) // sometimes this should not be used at all. RisuAI does not support embedding model that this is meaningful, isn't it?
); );
let scoredResults: { [key: string]: number } = {}; let scoredResults: { [key: string]: number } = {};
for (let i = 0; i < 3; i++) { for (let i = 0; i < 3; i++) { // Should parameterize this, fixed length 3 is a magic number without explanation
const pop = chats[chats.length - i - 1]; const pop = chats[chats.length - i - 1];
if (!pop) break; if (!pop) break;
const searched = await processor.similaritySearchScored( const searched = await processor.similaritySearchScored(
@@ -355,16 +339,17 @@ export async function hypaMemoryV2(
scoredArray.length > 0 scoredArray.length > 0
) { ) {
const [text] = scoredArray.shift(); const [text] = scoredArray.shift();
const content = text.substring(prefixLength);
const tokenized = await tokenizer.tokenizeChat({ const tokenized = await tokenizer.tokenizeChat({
role: "system", role: "system",
content: text.substring(14), content: content,
}); });
if ( if (
tokenized > tokenized >
allocatedTokens - mainPromptTokens - chunkResultTokens allocatedTokens - mainPromptTokens - chunkResultTokens
) )
break; break;
chunkResultPrompts += text.substring(14) + "\n\n"; chunkResultPrompts += content + "\n\n";
chunkResultTokens += tokenized; chunkResultTokens += tokenized;
} }
@@ -372,7 +357,7 @@ export async function hypaMemoryV2(
// Filter out summarized chats // Filter out summarized chats
const unsummarizedChats = chats.filter( const unsummarizedChats = chats.filter(
(_, idx) => !summarizedIndices.has(idx) (chat) => !summarizedMemos.has(chat.memo)
); );
// Insert the memory system prompt at the beginning // Insert the memory system prompt at the beginning