feat: Implement knowledge import feature for characters
- Added KnowledgeImport page for importing character knowledge from URLs. - Integrated URL validation and error handling for unsupported websites. - Created API endpoints for importing content from URLs and retrieving character knowledge. - Enhanced VectorStoreService with logging and error handling for vector memory storage. - Updated frontend to display knowledge sources and manage them effectively. - Added support for fetching recent character knowledge as a fallback in similarity searches. - Updated OpenAPI documentation to reflect new import functionality.
This commit is contained in:
14
apps/backend/jest.config.js
Normal file
14
apps/backend/jest.config.js
Normal file
@@ -0,0 +1,14 @@
|
||||
module.exports = {
|
||||
moduleFileExtensions: ['js', 'json', 'ts'],
|
||||
rootDir: 'src',
|
||||
testRegex: '.*\\.spec\\.ts$',
|
||||
transform: {
|
||||
'^.+\\.(t|j)s$': 'ts-jest',
|
||||
},
|
||||
collectCoverageFrom: ['**/*.(t|j)s'],
|
||||
coverageDirectory: '../coverage',
|
||||
testEnvironment: 'node',
|
||||
moduleNameMapper: {
|
||||
'^@dreamchat/shared$': '<rootDir>/../../packages/shared/dist',
|
||||
},
|
||||
};
|
||||
@@ -4,7 +4,9 @@
|
||||
"scripts": {
|
||||
"build": "nest build",
|
||||
"dev": "nest start --watch",
|
||||
"start": "node dist/main",
|
||||
"start": "node --max-old-space-size=768 dist/main",
|
||||
"start:low-memory": "node --max-old-space-size=384 dist/main",
|
||||
"start:high-memory": "node --max-old-space-size=1536 dist/main",
|
||||
"test": "jest",
|
||||
"test:watch": "jest --watch",
|
||||
"lint": "eslint \"{src,apps,libs,test}/**/*.ts\"",
|
||||
@@ -46,6 +48,7 @@
|
||||
"@nestjs/cli": "^11.0.16",
|
||||
"@nestjs/testing": "^11.1.14",
|
||||
"@types/bcrypt": "^6.0.0",
|
||||
"@types/jest": "^30.0.0",
|
||||
"@types/jsonwebtoken": "^9.0.0",
|
||||
"@types/multer": "^1.4.12",
|
||||
"@types/node": "^24.10.13",
|
||||
@@ -53,6 +56,7 @@
|
||||
"@types/passport-local": "^1.0.0",
|
||||
"jest": "^30.2.0",
|
||||
"prisma": "^7.4.1",
|
||||
"ts-jest": "^29.4.6",
|
||||
"typescript": "^5.3.0"
|
||||
}
|
||||
}
|
||||
@@ -1,4 +1,4 @@
|
||||
import { Module } from '@nestjs/common';
|
||||
import { Module, NestModule, MiddlewareConsumer } from '@nestjs/common';
|
||||
import { APP_GUARD } from '@nestjs/core';
|
||||
import { PrismaModule } from './prisma/prisma.module';
|
||||
import { AuthModule } from './auth/auth.module';
|
||||
@@ -9,6 +9,7 @@ import { VectorModule } from './vector/vector.module';
|
||||
import { ChatModule } from './chat/chat.module';
|
||||
import { ImportModule } from './import/import.module';
|
||||
import { JwtAuthGuard } from './auth/guards/jwt-auth.guard';
|
||||
import { RequestLoggerMiddleware } from './common/middleware';
|
||||
|
||||
@Module({
|
||||
imports: [
|
||||
@@ -29,4 +30,10 @@ import { JwtAuthGuard } from './auth/guards/jwt-auth.guard';
|
||||
},
|
||||
],
|
||||
})
|
||||
export class AppModule {}
|
||||
export class AppModule implements NestModule {
|
||||
configure(consumer: MiddlewareConsumer) {
|
||||
consumer
|
||||
.apply(RequestLoggerMiddleware)
|
||||
.forRoutes('*');
|
||||
}
|
||||
}
|
||||
|
||||
@@ -8,7 +8,7 @@ import {
|
||||
OnGatewayDisconnect,
|
||||
} from '@nestjs/websockets';
|
||||
import { Server, Socket } from 'socket.io';
|
||||
import { UseGuards } from '@nestjs/common';
|
||||
import { UseGuards, Logger } from '@nestjs/common';
|
||||
import { ChatService } from './chat.service';
|
||||
import { JwtService } from '@nestjs/jwt';
|
||||
|
||||
@@ -27,6 +27,8 @@ export class ChatGateway implements OnGatewayConnection, OnGatewayDisconnect {
|
||||
@WebSocketServer()
|
||||
server: Server;
|
||||
|
||||
private readonly logger = new Logger(ChatGateway.name);
|
||||
|
||||
constructor(
|
||||
private chatService: ChatService,
|
||||
private jwtService: JwtService,
|
||||
@@ -83,6 +85,8 @@ export class ChatGateway implements OnGatewayConnection, OnGatewayDisconnect {
|
||||
) {
|
||||
if (!client.userId) return;
|
||||
|
||||
this.logger.debug(`[handleSendMessage] Received message from user ${client.userId} for conversation ${data.conversationId}: "${data.content}"`);
|
||||
|
||||
const room = `conversation:${data.conversationId}`;
|
||||
|
||||
try {
|
||||
@@ -119,6 +123,7 @@ export class ChatGateway implements OnGatewayConnection, OnGatewayDisconnect {
|
||||
conversationId: data.conversationId,
|
||||
assistantMessage,
|
||||
});
|
||||
this.logger.debug(`[handleSendMessage] Message streaming completed for conversation ${data.conversationId}`);
|
||||
} catch (error) {
|
||||
client.emit('error', {
|
||||
conversationId: data.conversationId,
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
import { Injectable, NotFoundException, ForbiddenException } from '@nestjs/common';
|
||||
import { Injectable, NotFoundException, ForbiddenException, Logger } from '@nestjs/common';
|
||||
import { PrismaService } from '../prisma/prisma.service';
|
||||
import { LLMService } from '../llm/llm.service';
|
||||
import { MemoryService } from '../vector/memory.service';
|
||||
@@ -8,6 +8,8 @@ import { Conversation, Message, MessageRole } from '@prisma/client';
|
||||
|
||||
@Injectable()
|
||||
export class ChatService {
|
||||
private readonly logger = new Logger(ChatService.name);
|
||||
|
||||
constructor(
|
||||
private prisma: PrismaService,
|
||||
private llmService: LLMService,
|
||||
@@ -15,15 +17,9 @@ export class ChatService {
|
||||
private characterService: CharacterService,
|
||||
) {}
|
||||
|
||||
async createConversation(
|
||||
userId: string,
|
||||
createConversationDto: CreateConversationDto,
|
||||
): Promise<Conversation> {
|
||||
async createConversation(userId: string, createConversationDto: CreateConversationDto): Promise<Conversation> {
|
||||
// Verify character exists and user has access
|
||||
const character = await this.characterService.findById(
|
||||
createConversationDto.characterId,
|
||||
userId,
|
||||
);
|
||||
const character = await this.characterService.findById(createConversationDto.characterId, userId);
|
||||
|
||||
return this.prisma.conversation.create({
|
||||
data: {
|
||||
@@ -96,11 +92,7 @@ export class ChatService {
|
||||
});
|
||||
}
|
||||
|
||||
async sendMessage(
|
||||
conversationId: string,
|
||||
userId: string,
|
||||
sendMessageDto: SendMessageDto,
|
||||
): Promise<{ userMessage: Message; assistantMessage: Message }> {
|
||||
async sendMessage(conversationId: string, userId: string, sendMessageDto: SendMessageDto): Promise<{ userMessage: Message; assistantMessage: Message }> {
|
||||
const conversation = await this.findConversationById(conversationId, userId);
|
||||
|
||||
// Create user message
|
||||
@@ -113,25 +105,20 @@ export class ChatService {
|
||||
});
|
||||
|
||||
// Store user message in vector memory
|
||||
await this.memoryService.storeConversationMessage(
|
||||
`User: ${sendMessageDto.content}`,
|
||||
conversationId,
|
||||
{ messageId: userMessage.id },
|
||||
);
|
||||
await this.memoryService.storeConversationMessage(`User: ${sendMessageDto.content}`, conversationId, { messageId: userMessage.id });
|
||||
|
||||
// Generate context from memory
|
||||
const memoryContext = await this.memoryService.buildContextForConversation(
|
||||
conversationId,
|
||||
sendMessageDto.content,
|
||||
conversation.characterId,
|
||||
);
|
||||
const memoryContext = await this.memoryService.buildContextForConversation(conversationId, sendMessageDto.content, conversation.characterId);
|
||||
|
||||
// Build messages for LLM
|
||||
const messages = this.buildLLMMessages(
|
||||
conversation.character.personalityPrompt,
|
||||
conversation.messages,
|
||||
sendMessageDto.content,
|
||||
memoryContext,
|
||||
const messages = this.buildLLMMessages(conversation.character.personalityPrompt, conversation.messages, sendMessageDto.content, memoryContext);
|
||||
|
||||
// Grouped debug logging
|
||||
this.logger.debug(
|
||||
`[sendMessage] conversation=${conversationId}\n` +
|
||||
`--- Knowledges/Context ---\n${memoryContext || '(none)'}\n` +
|
||||
`--- Full Messages to LLM ---\n` +
|
||||
messages.map((msg, idx) => `[${idx}] ${msg.role}: ${msg.content.substring(0, 200)}${msg.content.length > 200 ? '...' : ''}`).join('\n'),
|
||||
);
|
||||
|
||||
// Generate response
|
||||
@@ -161,20 +148,12 @@ export class ChatService {
|
||||
});
|
||||
|
||||
// Store assistant response in vector memory
|
||||
await this.memoryService.storeConversationMessage(
|
||||
`${conversation.character.name}: ${response.content}`,
|
||||
conversationId,
|
||||
{ messageId: assistantMessage.id },
|
||||
);
|
||||
await this.memoryService.storeConversationMessage(`${conversation.character.name}: ${response.content}`, conversationId, { messageId: assistantMessage.id });
|
||||
|
||||
return { userMessage, assistantMessage };
|
||||
}
|
||||
|
||||
async *streamMessage(
|
||||
conversationId: string,
|
||||
userId: string,
|
||||
sendMessageDto: SendMessageDto,
|
||||
): AsyncGenerator<{ type: 'chunk' | 'message'; data: any }> {
|
||||
async *streamMessage(conversationId: string, userId: string, sendMessageDto: SendMessageDto): AsyncGenerator<{ type: 'chunk' | 'message'; data: any }> {
|
||||
const conversation = await this.findConversationById(conversationId, userId);
|
||||
|
||||
// Create user message
|
||||
@@ -189,25 +168,20 @@ export class ChatService {
|
||||
yield { type: 'message', data: { userMessage } };
|
||||
|
||||
// Store user message in vector memory
|
||||
await this.memoryService.storeConversationMessage(
|
||||
`User: ${sendMessageDto.content}`,
|
||||
conversationId,
|
||||
{ messageId: userMessage.id },
|
||||
);
|
||||
await this.memoryService.storeConversationMessage(`User: ${sendMessageDto.content}`, conversationId, { messageId: userMessage.id });
|
||||
|
||||
// Generate context from memory
|
||||
const memoryContext = await this.memoryService.buildContextForConversation(
|
||||
conversationId,
|
||||
sendMessageDto.content,
|
||||
conversation.characterId,
|
||||
);
|
||||
const memoryContext = await this.memoryService.buildContextForConversation(conversationId, sendMessageDto.content, conversation.characterId);
|
||||
|
||||
// Build messages for LLM
|
||||
const messages = this.buildLLMMessages(
|
||||
conversation.character.personalityPrompt,
|
||||
conversation.messages,
|
||||
sendMessageDto.content,
|
||||
memoryContext,
|
||||
const messages = this.buildLLMMessages(conversation.character.personalityPrompt, conversation.messages, sendMessageDto.content, memoryContext);
|
||||
|
||||
// Grouped debug logging
|
||||
this.logger.debug(
|
||||
`[streamMessage] conversation=${conversationId}\n` +
|
||||
`--- Knowledges/Context ---\n${memoryContext || '(none)'}\n` +
|
||||
`--- Full Messages to LLM ---\n` +
|
||||
messages.map((msg, idx) => `[${idx}] ${msg.role}: ${msg.content.substring(0, 200)}${msg.content.length > 200 ? '...' : ''}`).join('\n'),
|
||||
);
|
||||
|
||||
// Generate streaming response
|
||||
@@ -233,10 +207,7 @@ export class ChatService {
|
||||
});
|
||||
|
||||
// Update conversation stats
|
||||
const tokensUsed = this.llmService.countTokens([
|
||||
...messages,
|
||||
{ role: 'assistant', content: fullContent },
|
||||
]);
|
||||
const tokensUsed = this.llmService.countTokens([...messages, { role: 'assistant', content: fullContent }]);
|
||||
|
||||
await this.prisma.conversation.update({
|
||||
where: { id: conversationId },
|
||||
@@ -247,11 +218,7 @@ export class ChatService {
|
||||
});
|
||||
|
||||
// Store assistant response in vector memory
|
||||
await this.memoryService.storeConversationMessage(
|
||||
`${conversation.character.name}: ${fullContent}`,
|
||||
conversationId,
|
||||
{ messageId: assistantMessage.id },
|
||||
);
|
||||
await this.memoryService.storeConversationMessage(`${conversation.character.name}: ${fullContent}`, conversationId, { messageId: assistantMessage.id });
|
||||
|
||||
yield { type: 'message', data: { assistantMessage } };
|
||||
}
|
||||
@@ -265,7 +232,10 @@ export class ChatService {
|
||||
const messages: Array<{ role: 'system' | 'user' | 'assistant'; content: string }> = [];
|
||||
|
||||
// Add system message with personality and context
|
||||
let systemContent = personalityPrompt;
|
||||
let systemContent = `You are now role playing as the character based on the following personality description. You should use this information to inform your responses and stay in character. Always try to stay in character and provide responses that align with the personality and history provided. You are now talking to a user. The user will say something, and you will respond as the character. Remember this is a conversation, keep it talking like and maintain your character\n\n`;
|
||||
if (personalityPrompt) {
|
||||
systemContent = personalityPrompt;
|
||||
}
|
||||
if (memoryContext) {
|
||||
systemContent += `\n\nUse the following context to inform your responses:\n${memoryContext}`;
|
||||
}
|
||||
|
||||
1
apps/backend/src/common/middleware/index.ts
Normal file
1
apps/backend/src/common/middleware/index.ts
Normal file
@@ -0,0 +1 @@
|
||||
export { RequestLoggerMiddleware } from './request-logger.middleware';
|
||||
@@ -0,0 +1,62 @@
|
||||
import { Injectable, NestMiddleware, Logger } from '@nestjs/common';
|
||||
import { Request, Response, NextFunction } from 'express';
|
||||
|
||||
@Injectable()
|
||||
export class RequestLoggerMiddleware implements NestMiddleware {
|
||||
private logger = new Logger('HTTP');
|
||||
private readonly isEnabled: boolean;
|
||||
private readonly logLevel: 'verbose' | 'standard' | 'minimal';
|
||||
|
||||
constructor() {
|
||||
this.isEnabled = process.env.REQUEST_LOGGER !== 'false';
|
||||
const level = process.env.REQUEST_LOGGER_LEVEL;
|
||||
this.logLevel = level === 'verbose' || level === 'minimal' ? level : 'standard';
|
||||
}
|
||||
|
||||
use(req: Request, res: Response, next: NextFunction) {
|
||||
if (!this.isEnabled) {
|
||||
return next();
|
||||
}
|
||||
|
||||
const { method, originalUrl, ip, headers } = req;
|
||||
const userAgent = headers['user-agent'] || 'unknown';
|
||||
const startTime = Date.now();
|
||||
|
||||
// Log request start (verbose only)
|
||||
if (this.logLevel === 'verbose') {
|
||||
this.logger.log(`${method} ${originalUrl} - ${ip} - ${userAgent}`);
|
||||
}
|
||||
|
||||
// Capture response finish
|
||||
res.on('finish', () => {
|
||||
const duration = Date.now() - startTime;
|
||||
const statusCode = res.statusCode;
|
||||
const contentLength = res.get('content-length') || 0;
|
||||
|
||||
// Build message based on log level
|
||||
let message: string;
|
||||
switch (this.logLevel) {
|
||||
case 'verbose':
|
||||
message = `${method} ${originalUrl} ${statusCode} - ${duration}ms - ${contentLength}b - ${ip}`;
|
||||
break;
|
||||
case 'minimal':
|
||||
message = `${method} ${originalUrl} ${statusCode}`;
|
||||
break;
|
||||
default: // standard
|
||||
message = `${method} ${originalUrl} ${statusCode} - ${duration}ms`;
|
||||
break;
|
||||
}
|
||||
|
||||
// Determine log level based on status code
|
||||
if (statusCode >= 500) {
|
||||
this.logger.error(message);
|
||||
} else if (statusCode >= 400) {
|
||||
this.logger.warn(message);
|
||||
} else {
|
||||
this.logger.log(message);
|
||||
}
|
||||
});
|
||||
|
||||
next();
|
||||
}
|
||||
}
|
||||
@@ -4,9 +4,11 @@ import {
|
||||
Get,
|
||||
Delete,
|
||||
Param,
|
||||
Body,
|
||||
UploadedFile,
|
||||
UseInterceptors,
|
||||
BadRequestException,
|
||||
Logger,
|
||||
} from '@nestjs/common';
|
||||
import { FileInterceptor } from '@nestjs/platform-express';
|
||||
import { ApiTags, ApiOperation, ApiResponse, ApiBearerAuth, ApiParam, ApiConsumes, ApiBody, ApiProperty } from '@nestjs/swagger';
|
||||
@@ -22,10 +24,17 @@ class UploadResponseDto {
|
||||
message: string;
|
||||
}
|
||||
|
||||
class ImportUrlDto {
|
||||
@ApiProperty({ description: 'URL to import', example: 'https://sakurazaka46.com/s/s46/diary/detail/68008' })
|
||||
url: string;
|
||||
}
|
||||
|
||||
@ApiTags('import')
|
||||
@ApiBearerAuth()
|
||||
@Controller('import')
|
||||
export class ImportController {
|
||||
private readonly logger = new Logger(ImportController.name);
|
||||
|
||||
constructor(private importService: ImportService) {}
|
||||
|
||||
@Post('characters/:characterId/files')
|
||||
@@ -98,4 +107,28 @@ export class ImportController {
|
||||
await this.importService.deleteKnowledge(knowledgeId, userId);
|
||||
return { message: 'Knowledge deleted successfully' };
|
||||
}
|
||||
|
||||
@Post('characters/:characterId/url')
|
||||
@ApiOperation({ summary: 'Import content from URL for character knowledge' })
|
||||
@ApiParam({ name: 'characterId', description: 'Character ID' })
|
||||
@ApiBody({ type: ImportUrlDto })
|
||||
@ApiResponse({ status: 201, description: 'URL content is being imported and processed', type: UploadResponseDto })
|
||||
@ApiResponse({ status: 400, description: 'Invalid URL or unsupported website' })
|
||||
@ApiResponse({ status: 401, description: 'Unauthorized' })
|
||||
async importFromUrl(
|
||||
@Param('characterId') characterId: string,
|
||||
@Body() importUrlDto: ImportUrlDto,
|
||||
@CurrentUser('userId') userId: string,
|
||||
): Promise<UploadResponseDto> {
|
||||
this.logger.log(`Received URL import request for character: ${characterId}, url: ${importUrlDto.url}`);
|
||||
|
||||
if (!importUrlDto.url) {
|
||||
this.logger.warn('URL import request rejected: URL is required');
|
||||
throw new BadRequestException('URL is required');
|
||||
}
|
||||
|
||||
const result = await this.importService.importFromUrl(importUrlDto.url, characterId, userId);
|
||||
this.logger.log(`URL import started, knowledgeId: ${result.knowledgeId}`);
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,19 +1,23 @@
|
||||
import { Injectable, BadRequestException } from '@nestjs/common';
|
||||
import { Injectable, BadRequestException, Logger } from '@nestjs/common';
|
||||
import { PrismaService } from '../prisma/prisma.service';
|
||||
import { MemoryService } from '../vector/memory.service';
|
||||
import { TextFileAdapter } from './adapters/text-file.adapter';
|
||||
import { IImportAdapter, ImportResult } from './interfaces/import-adapter.interface';
|
||||
import { ImportStatus } from '@prisma/client';
|
||||
import { IWebScraper, WebScraperResult } from './interfaces/web-scraper.interface';
|
||||
import { SakurazakaScraper } from './scrapers/sakurazaka-scraper';
|
||||
|
||||
@Injectable()
|
||||
export class ImportService {
|
||||
private readonly logger = new Logger(ImportService.name);
|
||||
private adapters: IImportAdapter[];
|
||||
private scrapers: IWebScraper[];
|
||||
|
||||
constructor(
|
||||
private prisma: PrismaService,
|
||||
private memoryService: MemoryService,
|
||||
) {
|
||||
this.adapters = [new TextFileAdapter()];
|
||||
this.scrapers = [new SakurazakaScraper()];
|
||||
}
|
||||
|
||||
async uploadFile(
|
||||
@@ -33,6 +37,14 @@ export class ImportService {
|
||||
// Parse the file
|
||||
const result = await adapter.parse(file);
|
||||
|
||||
// Reject if content is too large
|
||||
const maxRawContentLength = 100000;
|
||||
if (result.content.length > maxRawContentLength) {
|
||||
throw new BadRequestException(
|
||||
`File too large: ${result.content.length} characters (max: ${maxRawContentLength}). Please use a smaller file.`
|
||||
);
|
||||
}
|
||||
|
||||
// Create knowledge entry
|
||||
const knowledge = await this.prisma.characterKnowledge.create({
|
||||
data: {
|
||||
@@ -49,18 +61,9 @@ export class ImportService {
|
||||
});
|
||||
|
||||
// Process the content in the background
|
||||
this.logger.log(`[${knowledge.id}] Starting background processing for file upload, character: ${characterId}`);
|
||||
this.processContent(knowledge.id, characterId, result).catch((error) => {
|
||||
console.error('Error processing import:', error);
|
||||
this.prisma.characterKnowledge.update({
|
||||
where: { id: knowledge.id },
|
||||
data: {
|
||||
status: 'failed',
|
||||
processingInfo: {
|
||||
...result.metadata,
|
||||
error: error instanceof Error ? error.message : 'Unknown error',
|
||||
},
|
||||
},
|
||||
});
|
||||
this.logger.error(`[${knowledge.id}] Error processing file import:`, error);
|
||||
});
|
||||
|
||||
return {
|
||||
@@ -141,24 +144,86 @@ export class ImportService {
|
||||
characterId: string,
|
||||
result: ImportResult,
|
||||
): Promise<void> {
|
||||
try {
|
||||
// Chunk the content into smaller pieces
|
||||
const chunks = this.chunkContent(result.content, 1000, 200);
|
||||
this.logger.log(`[${knowledgeId}] Starting processContent, content length: ${result.content.length}`);
|
||||
|
||||
// Reduce memory pressure by limiting content size more aggressively
|
||||
const maxContentLength = 15000; // Reduced from 30000
|
||||
let content = result.content;
|
||||
if (content.length > maxContentLength) {
|
||||
this.logger.warn(`[${knowledgeId}] Content truncated from ${content.length} to ${maxContentLength} characters`);
|
||||
content = content.substring(0, maxContentLength) + '\n\n[Content truncated due to size limits]';
|
||||
}
|
||||
|
||||
// Store each chunk in vector memory
|
||||
for (let i = 0; i < chunks.length; i++) {
|
||||
await this.memoryService.storeCharacterKnowledge(
|
||||
chunks[i],
|
||||
characterId,
|
||||
knowledgeId,
|
||||
{
|
||||
...result.metadata,
|
||||
chunkIndex: i,
|
||||
totalChunks: chunks.length,
|
||||
},
|
||||
);
|
||||
// Process chunks one at a time using generator to minimize memory usage
|
||||
const chunkSize = 800; // Reduced from 1000
|
||||
const overlap = 100; // Reduced from 200
|
||||
const maxChunks = 20; // Reduced from 30
|
||||
|
||||
let processedChunks = 0;
|
||||
let totalChunks = 0;
|
||||
let start = 0;
|
||||
|
||||
try {
|
||||
this.logger.log(`[${knowledgeId}] Processing chunks with streaming approach...`);
|
||||
|
||||
while (start < content.length && processedChunks < maxChunks) {
|
||||
// Calculate chunk boundaries
|
||||
const end = Math.min(start + chunkSize, content.length);
|
||||
let chunkEnd = end;
|
||||
|
||||
// Try to break at a sentence boundary (only if not at end)
|
||||
if (end < content.length) {
|
||||
const chunk = content.slice(start, end);
|
||||
const lastPeriod = chunk.lastIndexOf('.');
|
||||
const lastNewline = chunk.lastIndexOf('\n');
|
||||
const breakPoint = Math.max(lastPeriod, lastNewline);
|
||||
|
||||
if (breakPoint > chunkSize * 0.5) {
|
||||
chunkEnd = start + breakPoint + 1;
|
||||
}
|
||||
}
|
||||
|
||||
// Extract the chunk
|
||||
const chunk = content.slice(start, chunkEnd).trim();
|
||||
|
||||
if (chunk.length > 0) {
|
||||
totalChunks++;
|
||||
|
||||
if (processedChunks < maxChunks) {
|
||||
this.logger.debug(`[${knowledgeId}] Processing chunk ${processedChunks + 1} (${chunk.length} chars)...`);
|
||||
|
||||
try {
|
||||
await this.memoryService.storeCharacterKnowledge(
|
||||
chunk,
|
||||
characterId,
|
||||
knowledgeId,
|
||||
{
|
||||
...result.metadata,
|
||||
chunkIndex: processedChunks,
|
||||
},
|
||||
);
|
||||
processedChunks++;
|
||||
this.logger.debug(`[${knowledgeId}] Chunk ${processedChunks} stored successfully`);
|
||||
} catch (chunkError) {
|
||||
this.logger.error(`[${knowledgeId}] Failed to store chunk ${processedChunks + 1}:`, chunkError);
|
||||
throw chunkError;
|
||||
}
|
||||
|
||||
// Force garbage collection opportunity between chunks
|
||||
if (processedChunks % 2 === 0) {
|
||||
await new Promise((resolve) => setTimeout(resolve, 150));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Move to next chunk position
|
||||
const nextStart = start + (chunkEnd - start) - overlap;
|
||||
if (nextStart <= start) break; // Prevent infinite loop
|
||||
start = nextStart;
|
||||
}
|
||||
|
||||
this.logger.log(`[${knowledgeId}] Processed ${processedChunks}/${totalChunks} chunks, updating status to completed`);
|
||||
|
||||
// Update status to completed
|
||||
await this.prisma.characterKnowledge.update({
|
||||
where: { id: knowledgeId },
|
||||
@@ -166,49 +231,219 @@ export class ImportService {
|
||||
status: 'completed',
|
||||
processingInfo: {
|
||||
...result.metadata,
|
||||
chunksProcessed: chunks.length,
|
||||
chunksProcessed: processedChunks,
|
||||
originalChunks: totalChunks,
|
||||
wasTruncated: result.content.length > maxContentLength || totalChunks > maxChunks,
|
||||
},
|
||||
},
|
||||
});
|
||||
|
||||
this.logger.log(`[${knowledgeId}] Processing completed successfully`);
|
||||
} catch (error) {
|
||||
this.logger.error(`[${knowledgeId}] Error in processContent:`, error);
|
||||
|
||||
// Update status to failed
|
||||
await this.prisma.characterKnowledge.update({
|
||||
where: { id: knowledgeId },
|
||||
data: {
|
||||
status: 'failed',
|
||||
processingInfo: {
|
||||
...result.metadata,
|
||||
error: error instanceof Error ? error.message : 'Unknown error',
|
||||
try {
|
||||
await this.prisma.characterKnowledge.update({
|
||||
where: { id: knowledgeId },
|
||||
data: {
|
||||
status: 'failed',
|
||||
processingInfo: {
|
||||
...result.metadata,
|
||||
error: error instanceof Error ? error.message : 'Unknown error',
|
||||
},
|
||||
},
|
||||
},
|
||||
});
|
||||
});
|
||||
this.logger.log(`[${knowledgeId}] Status updated to failed`);
|
||||
} catch (dbError) {
|
||||
this.logger.error(`[${knowledgeId}] Failed to update status to failed:`, dbError);
|
||||
}
|
||||
|
||||
throw error;
|
||||
}
|
||||
}
|
||||
|
||||
private chunkContent(content: string, chunkSize: number, overlap: number): string[] {
|
||||
const chunks: string[] = [];
|
||||
let start = 0;
|
||||
|
||||
while (start < content.length) {
|
||||
const end = Math.min(start + chunkSize, content.length);
|
||||
let chunk = content.slice(start, end);
|
||||
|
||||
// Try to break at a sentence boundary
|
||||
if (end < content.length) {
|
||||
const lastPeriod = chunk.lastIndexOf('.');
|
||||
const lastNewline = chunk.lastIndexOf('\n');
|
||||
const breakPoint = Math.max(lastPeriod, lastNewline);
|
||||
|
||||
if (breakPoint > chunkSize * 0.5) {
|
||||
chunk = chunk.slice(0, breakPoint + 1);
|
||||
}
|
||||
}
|
||||
|
||||
chunks.push(chunk.trim());
|
||||
start += chunk.length - overlap;
|
||||
async importFromUrl(
|
||||
url: string,
|
||||
characterId: string,
|
||||
userId: string,
|
||||
): Promise<{ knowledgeId: string; message: string }> {
|
||||
// Validate URL format
|
||||
let urlObj: URL;
|
||||
try {
|
||||
urlObj = new URL(url);
|
||||
} catch {
|
||||
throw new BadRequestException('Invalid URL format');
|
||||
}
|
||||
|
||||
return chunks;
|
||||
// Find appropriate scraper
|
||||
const scraper = this.scrapers.find((s) => s.canHandle(url));
|
||||
|
||||
if (!scraper) {
|
||||
throw new BadRequestException(
|
||||
`Unsupported URL: ${urlObj.hostname}. No scraper available for this website.`,
|
||||
);
|
||||
}
|
||||
|
||||
// Scrape the content
|
||||
const result = await scraper.scrape(url);
|
||||
|
||||
// Reject if content is too large
|
||||
const maxRawContentLength = 100000;
|
||||
if (result.content.length > maxRawContentLength) {
|
||||
throw new BadRequestException(
|
||||
`Content too large: ${result.content.length} characters (max: ${maxRawContentLength}). Please use a shorter article.`
|
||||
);
|
||||
}
|
||||
|
||||
// Create knowledge entry
|
||||
const knowledgeName = result.metadata.title || `Import from ${urlObj.hostname}`;
|
||||
const knowledge = await this.prisma.characterKnowledge.create({
|
||||
data: {
|
||||
name: knowledgeName,
|
||||
sourceType: 'url',
|
||||
sourceName: url,
|
||||
mimeType: 'text/html',
|
||||
rawContent: result.content,
|
||||
status: 'processing',
|
||||
processingInfo: result.metadata,
|
||||
characterId,
|
||||
},
|
||||
});
|
||||
|
||||
// Process the content in the background
|
||||
this.logger.log(`[${knowledge.id}] Starting background processing for URL: ${url}, character: ${characterId}`);
|
||||
this.processScrapedContent(knowledge.id, characterId, result).catch((error) => {
|
||||
this.logger.error(`[${knowledge.id}] Error processing URL import:`, error);
|
||||
});
|
||||
|
||||
return {
|
||||
knowledgeId: knowledge.id,
|
||||
message: 'URL content is being imported and processed',
|
||||
};
|
||||
}
|
||||
|
||||
private async processScrapedContent(
|
||||
knowledgeId: string,
|
||||
characterId: string,
|
||||
result: WebScraperResult,
|
||||
): Promise<void> {
|
||||
this.logger.log(`[${knowledgeId}] Starting processScrapedContent, content length: ${result.content.length}`);
|
||||
|
||||
// Reduce memory pressure by limiting content size more aggressively
|
||||
const maxContentLength = 15000; // Reduced from 30000
|
||||
let content = result.content;
|
||||
if (content.length > maxContentLength) {
|
||||
this.logger.warn(`[${knowledgeId}] Content truncated from ${content.length} to ${maxContentLength} characters`);
|
||||
content = content.substring(0, maxContentLength) + '\n\n[Content truncated due to size limits]';
|
||||
}
|
||||
|
||||
// Process chunks one at a time using streaming approach to minimize memory usage
|
||||
const chunkSize = 800; // Reduced from 1000
|
||||
const overlap = 100; // Reduced from 200
|
||||
const maxChunks = 20; // Reduced from 30
|
||||
|
||||
let processedChunks = 0;
|
||||
let totalChunks = 0;
|
||||
let start = 0;
|
||||
|
||||
try {
|
||||
this.logger.log(`[${knowledgeId}] Processing chunks with streaming approach...`);
|
||||
|
||||
while (start < content.length && processedChunks < maxChunks) {
|
||||
// Calculate chunk boundaries
|
||||
const end = Math.min(start + chunkSize, content.length);
|
||||
let chunkEnd = end;
|
||||
|
||||
// Try to break at a sentence boundary (only if not at end)
|
||||
if (end < content.length) {
|
||||
const chunk = content.slice(start, end);
|
||||
const lastPeriod = chunk.lastIndexOf('.');
|
||||
const lastNewline = chunk.lastIndexOf('\n');
|
||||
const breakPoint = Math.max(lastPeriod, lastNewline);
|
||||
|
||||
if (breakPoint > chunkSize * 0.5) {
|
||||
chunkEnd = start + breakPoint + 1;
|
||||
}
|
||||
}
|
||||
|
||||
// Extract the chunk
|
||||
const chunk = content.slice(start, chunkEnd).trim();
|
||||
|
||||
if (chunk.length > 0) {
|
||||
totalChunks++;
|
||||
|
||||
if (processedChunks < maxChunks) {
|
||||
this.logger.debug(`[${knowledgeId}] Processing chunk ${processedChunks + 1} (${chunk.length} chars)...`);
|
||||
|
||||
try {
|
||||
await this.memoryService.storeCharacterKnowledge(
|
||||
chunk,
|
||||
characterId,
|
||||
knowledgeId,
|
||||
{
|
||||
...result.metadata,
|
||||
chunkIndex: processedChunks,
|
||||
},
|
||||
);
|
||||
processedChunks++;
|
||||
this.logger.debug(`[${knowledgeId}] Chunk ${processedChunks} stored successfully`);
|
||||
} catch (chunkError) {
|
||||
this.logger.error(`[${knowledgeId}] Failed to store chunk ${processedChunks + 1}:`, chunkError);
|
||||
throw chunkError;
|
||||
}
|
||||
|
||||
// Force garbage collection opportunity between chunks
|
||||
if (processedChunks % 2 === 0) {
|
||||
await new Promise((resolve) => setTimeout(resolve, 150));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Move to next chunk position
|
||||
const nextStart = start + (chunkEnd - start) - overlap;
|
||||
if (nextStart <= start) break; // Prevent infinite loop
|
||||
start = nextStart;
|
||||
}
|
||||
|
||||
this.logger.log(`[${knowledgeId}] Processed ${processedChunks}/${totalChunks} chunks, updating status to completed`);
|
||||
|
||||
// Update status to completed
|
||||
await this.prisma.characterKnowledge.update({
|
||||
where: { id: knowledgeId },
|
||||
data: {
|
||||
status: 'completed',
|
||||
processingInfo: {
|
||||
...result.metadata,
|
||||
chunksProcessed: processedChunks,
|
||||
originalChunks: totalChunks,
|
||||
wasTruncated: result.content.length > maxContentLength || totalChunks > maxChunks,
|
||||
},
|
||||
},
|
||||
});
|
||||
|
||||
this.logger.log(`[${knowledgeId}] Processing completed successfully`);
|
||||
} catch (error) {
|
||||
this.logger.error(`[${knowledgeId}] Error in processScrapedContent:`, error);
|
||||
|
||||
// Update status to failed
|
||||
try {
|
||||
await this.prisma.characterKnowledge.update({
|
||||
where: { id: knowledgeId },
|
||||
data: {
|
||||
status: 'failed',
|
||||
processingInfo: {
|
||||
...result.metadata,
|
||||
error: error instanceof Error ? error.message : 'Unknown error',
|
||||
},
|
||||
},
|
||||
});
|
||||
this.logger.log(`[${knowledgeId}] Status updated to failed`);
|
||||
} catch (dbError) {
|
||||
this.logger.error(`[${knowledgeId}] Failed to update status to failed:`, dbError);
|
||||
}
|
||||
|
||||
throw error;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
2
apps/backend/src/import/interfaces/index.ts
Normal file
2
apps/backend/src/import/interfaces/index.ts
Normal file
@@ -0,0 +1,2 @@
|
||||
export { IImportAdapter, ImportResult } from './import-adapter.interface';
|
||||
export { IWebScraper, WebScraperResult } from './web-scraper.interface';
|
||||
23
apps/backend/src/import/interfaces/web-scraper.interface.ts
Normal file
23
apps/backend/src/import/interfaces/web-scraper.interface.ts
Normal file
@@ -0,0 +1,23 @@
|
||||
export interface WebScraperResult {
|
||||
content: string;
|
||||
metadata: {
|
||||
sourceName: string;
|
||||
url: string;
|
||||
title?: string;
|
||||
author?: string;
|
||||
publishedDate?: string;
|
||||
[key: string]: any;
|
||||
};
|
||||
}
|
||||
|
||||
export interface IWebScraper {
|
||||
/**
|
||||
* Check if this scraper can handle the given URL
|
||||
*/
|
||||
canHandle(url: string): boolean;
|
||||
|
||||
/**
|
||||
* Scrape content from the URL
|
||||
*/
|
||||
scrape(url: string): Promise<WebScraperResult>;
|
||||
}
|
||||
1
apps/backend/src/import/scrapers/index.ts
Normal file
1
apps/backend/src/import/scrapers/index.ts
Normal file
@@ -0,0 +1 @@
|
||||
export { SakurazakaScraper } from './sakurazaka-scraper';
|
||||
337
apps/backend/src/import/scrapers/sakurazaka-scraper.spec.ts
Normal file
337
apps/backend/src/import/scrapers/sakurazaka-scraper.spec.ts
Normal file
@@ -0,0 +1,337 @@
|
||||
import { SakurazakaScraper } from './sakurazaka-scraper';
|
||||
|
||||
describe('SakurazakaScraper', () => {
|
||||
let scraper: SakurazakaScraper;
|
||||
|
||||
beforeEach(() => {
|
||||
scraper = new SakurazakaScraper();
|
||||
});
|
||||
|
||||
describe('canHandle', () => {
|
||||
it('should return true for sakurazaka46.com URLs', () => {
|
||||
const urls = [
|
||||
'https://sakurazaka46.com/s/s46/diary/detail/68008',
|
||||
'https://sakurazaka46.com/s/s46/diary/detail/68008?ima=0000&cd=blog',
|
||||
'https://www.sakurazaka46.com/s/s46/diary/detail/10000',
|
||||
'https://sakurazaka46.com/s/s46/diary/blog/list',
|
||||
];
|
||||
|
||||
urls.forEach((url) => {
|
||||
expect(scraper.canHandle(url)).toBe(true);
|
||||
});
|
||||
});
|
||||
|
||||
it('should return false for non-sakurazaka46.com URLs', () => {
|
||||
const urls = [
|
||||
'https://example.com/blog/123',
|
||||
'https://nogizaka46.com/s/n46/diary/detail/12345',
|
||||
'https://hinatazaka46.com/s/h46/diary/detail/12345',
|
||||
'https://sakurazaka46.net/fake/blog/123',
|
||||
'not-a-url',
|
||||
'',
|
||||
];
|
||||
|
||||
urls.forEach((url) => {
|
||||
expect(scraper.canHandle(url)).toBe(false);
|
||||
});
|
||||
});
|
||||
|
||||
it('should return false for invalid URLs', () => {
|
||||
expect(scraper.canHandle('invalid-url')).toBe(false);
|
||||
expect(scraper.canHandle('')).toBe(false);
|
||||
});
|
||||
});
|
||||
|
||||
describe('scrape', () => {
|
||||
const mockBlogHtml = `
|
||||
<article class="post wovn-ignore">
|
||||
<div class="col-l widfix-sp">
|
||||
<div class="com-calendindinav">
|
||||
<div class="wrap-bg">
|
||||
<div class="inner">
|
||||
<div class="year-month">
|
||||
<div class="ym-inner wf-a">
|
||||
<div class="ym-txt">
|
||||
<span class="ym-year">2026</span>
|
||||
<span class="ym-month">2</span>
|
||||
</div>
|
||||
<p class="date wf-a">18</p>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
<div class="col-r widfix-sp">
|
||||
<div class="inner title-wrap"><h1 class="title">The growing up train</h1></div>
|
||||
</div>
|
||||
<div class="col-l eigo-wrap pc">
|
||||
<div class="eigo-inner">
|
||||
<p class="eigo wf-a">YU MURAI</p>
|
||||
</div>
|
||||
</div>
|
||||
<div class="col-r">
|
||||
<div class="box-article">
|
||||
<p><br/><br/>こんばんは<br/><br/>14thシングル<br/><br/>嬉しいです</p>
|
||||
<p>四期生は肝が据わっています!<br/></p>
|
||||
<img src="/files/14/diary/s46/blog/moblog/202602/mobpqiCQR.jpg"/>
|
||||
<p>またね〜<br/>村井優</p>
|
||||
</div>
|
||||
<div class="blog-foot-nav">
|
||||
<div class="com-btn-lcr">
|
||||
<p class="btn-type1s"><a href="/s/s46/diary/detail/67798">前へ</a></p>
|
||||
<p class="btn-type3"><a href="/s/s46/diary/blog/list?ct=67">村井 優のブログ一覧</a></p>
|
||||
</div>
|
||||
</div>
|
||||
<div class="app-button">
|
||||
<div class="box-a">
|
||||
<p class="app-title">櫻坂46メッセージ</p>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</article>
|
||||
`;
|
||||
|
||||
beforeEach(() => {
|
||||
global.fetch = jest.fn();
|
||||
});
|
||||
|
||||
afterEach(() => {
|
||||
jest.resetAllMocks();
|
||||
});
|
||||
|
||||
it('should successfully scrape blog content', async () => {
|
||||
(global.fetch as jest.Mock).mockResolvedValueOnce({
|
||||
ok: true,
|
||||
text: () => Promise.resolve(mockBlogHtml),
|
||||
});
|
||||
|
||||
const result = await scraper.scrape('https://sakurazaka46.com/s/s46/diary/detail/68008');
|
||||
|
||||
expect(result.content).toContain('Title: The growing up train');
|
||||
expect(result.content).toContain('Author: YU MURAI');
|
||||
expect(result.content).toContain('Date: 2026-02-18');
|
||||
expect(result.content).toContain('こんばんは');
|
||||
expect(result.content).toContain('[Image: mobpqiCQR.jpg]');
|
||||
expect(result.content).toContain('またね〜');
|
||||
expect(result.content).toContain('村井優');
|
||||
});
|
||||
|
||||
it('should extract correct metadata', async () => {
|
||||
(global.fetch as jest.Mock).mockResolvedValueOnce({
|
||||
ok: true,
|
||||
text: () => Promise.resolve(mockBlogHtml),
|
||||
});
|
||||
|
||||
const result = await scraper.scrape('https://sakurazaka46.com/s/s46/diary/detail/68008');
|
||||
|
||||
expect(result.metadata).toEqual({
|
||||
sourceName: 'Sakurazaka46 Blog',
|
||||
url: 'https://sakurazaka46.com/s/s46/diary/detail/68008',
|
||||
title: 'The growing up train',
|
||||
author: 'YU MURAI',
|
||||
publishedDate: '2026-02-18',
|
||||
blogId: '68008',
|
||||
});
|
||||
});
|
||||
|
||||
it('should handle blog posts without title gracefully', async () => {
|
||||
const htmlWithoutTitle = mockBlogHtml.replace(
|
||||
/<h1 class="title">[^<]*<\/h1>/,
|
||||
''
|
||||
);
|
||||
|
||||
(global.fetch as jest.Mock).mockResolvedValueOnce({
|
||||
ok: true,
|
||||
text: () => Promise.resolve(htmlWithoutTitle),
|
||||
});
|
||||
|
||||
const result = await scraper.scrape('https://sakurazaka46.com/s/s46/diary/detail/68008');
|
||||
|
||||
expect(result.metadata.title).toBeFalsy();
|
||||
});
|
||||
|
||||
it('should handle blog posts without author gracefully', async () => {
|
||||
const htmlWithoutAuthor = mockBlogHtml.replace(
|
||||
/<p class="eigo wf-a">[^<]*<\/p>/,
|
||||
''
|
||||
);
|
||||
|
||||
(global.fetch as jest.Mock).mockResolvedValueOnce({
|
||||
ok: true,
|
||||
text: () => Promise.resolve(htmlWithoutAuthor),
|
||||
});
|
||||
|
||||
const result = await scraper.scrape('https://sakurazaka46.com/s/s46/diary/detail/68008');
|
||||
|
||||
expect(result.metadata.author).toBeFalsy();
|
||||
});
|
||||
|
||||
it('should handle HTTP errors', async () => {
|
||||
(global.fetch as jest.Mock).mockResolvedValueOnce({
|
||||
ok: false,
|
||||
status: 404,
|
||||
statusText: 'Not Found',
|
||||
});
|
||||
|
||||
await expect(
|
||||
scraper.scrape('https://sakurazaka46.com/s/s46/diary/detail/99999')
|
||||
).rejects.toThrow('Failed to fetch page: 404 Not Found');
|
||||
});
|
||||
|
||||
it('should handle network errors', async () => {
|
||||
(global.fetch as jest.Mock).mockRejectedValueOnce(new Error('Network error'));
|
||||
|
||||
await expect(
|
||||
scraper.scrape('https://sakurazaka46.com/s/s46/diary/detail/68008')
|
||||
).rejects.toThrow('Failed to scrape Sakurazaka46 blog: Network error');
|
||||
});
|
||||
|
||||
it('should handle missing article element', async () => {
|
||||
const htmlWithoutArticle = '<div>No article here</div>';
|
||||
|
||||
(global.fetch as jest.Mock).mockResolvedValueOnce({
|
||||
ok: true,
|
||||
text: () => Promise.resolve(htmlWithoutArticle),
|
||||
});
|
||||
|
||||
await expect(
|
||||
scraper.scrape('https://sakurazaka46.com/s/s46/diary/detail/68008')
|
||||
).rejects.toThrow('Failed to scrape Sakurazaka46 blog: Could not find article content in the page');
|
||||
});
|
||||
|
||||
it('should handle blog ID extraction from various URL formats', async () => {
|
||||
(global.fetch as jest.Mock).mockResolvedValue({
|
||||
ok: true,
|
||||
text: () => Promise.resolve(mockBlogHtml),
|
||||
});
|
||||
|
||||
const testCases = [
|
||||
{ url: 'https://sakurazaka46.com/s/s46/diary/detail/68008', expectedId: '68008' },
|
||||
{ url: 'https://sakurazaka46.com/s/s46/diary/detail/68008?ima=0000&cd=blog', expectedId: '68008' },
|
||||
{ url: 'https://sakurazaka46.com/s/s46/diary/detail/12345#anchor', expectedId: '12345' },
|
||||
];
|
||||
|
||||
for (const { url, expectedId } of testCases) {
|
||||
const result = await scraper.scrape(url);
|
||||
expect(result.metadata.blogId).toBe(expectedId);
|
||||
}
|
||||
});
|
||||
|
||||
it('should clean up HTML entities and tags properly', async () => {
|
||||
const htmlWithEntities = `
|
||||
<article>
|
||||
<h1 class="title">Test & Example</h1>
|
||||
<div class="box-article">
|
||||
<p>Hello world <script>alert(1)</script></p>
|
||||
<p>Line 1<br/>Line 2</p>
|
||||
</div>
|
||||
</article>
|
||||
`;
|
||||
|
||||
(global.fetch as jest.Mock).mockResolvedValueOnce({
|
||||
ok: true,
|
||||
text: () => Promise.resolve(htmlWithEntities),
|
||||
});
|
||||
|
||||
const result = await scraper.scrape('https://sakurazaka46.com/s/s46/diary/detail/1');
|
||||
|
||||
expect(result.content).toContain('Test & Example');
|
||||
expect(result.content).toContain('Hello world');
|
||||
expect(result.content).not.toContain(' ');
|
||||
expect(result.content).not.toContain('<script>');
|
||||
expect(result.content).not.toContain('<br/>');
|
||||
});
|
||||
|
||||
it('should remove navigation and footer sections', async () => {
|
||||
(global.fetch as jest.Mock).mockResolvedValueOnce({
|
||||
ok: true,
|
||||
text: () => Promise.resolve(mockBlogHtml),
|
||||
});
|
||||
|
||||
const result = await scraper.scrape('https://sakurazaka46.com/s/s46/diary/detail/68008');
|
||||
|
||||
// Navigation elements should not be in the content
|
||||
expect(result.content).not.toContain('前へ');
|
||||
expect(result.content).not.toContain('村井 優のブログ一覧');
|
||||
expect(result.content).not.toContain('櫻坂46メッセージ');
|
||||
expect(result.content).not.toContain('blog-foot-nav');
|
||||
expect(result.content).not.toContain('app-button');
|
||||
});
|
||||
|
||||
it('should handle multiple images in blog post', async () => {
|
||||
const htmlWithMultipleImages = `
|
||||
<article>
|
||||
<h1 class="title">Multi Image Post</h1>
|
||||
<div class="box-article">
|
||||
<img src="/files/image1.jpg"/>
|
||||
<p>Text between images</p>
|
||||
<img src="/files/image2.png"/>
|
||||
<img src="/files/image3.gif"/>
|
||||
<p>After images text</p>
|
||||
</div>
|
||||
</article>
|
||||
`;
|
||||
|
||||
(global.fetch as jest.Mock).mockResolvedValueOnce({
|
||||
ok: true,
|
||||
text: () => Promise.resolve(htmlWithMultipleImages),
|
||||
});
|
||||
|
||||
const result = await scraper.scrape('https://sakurazaka46.com/s/s46/diary/detail/1');
|
||||
|
||||
expect(result.content).toContain('[Image: image1.jpg]');
|
||||
expect(result.content).toContain('[Image: image2.png]');
|
||||
expect(result.content).toContain('[Image: image3.gif]');
|
||||
expect(result.content).toContain('Text between images');
|
||||
expect(result.content).toContain('After images text');
|
||||
});
|
||||
|
||||
it('should extract all images from complex blog structure', async () => {
|
||||
const htmlWithComplexImages = `
|
||||
<article class="post">
|
||||
<div class="col-r">
|
||||
<div class="box-article">
|
||||
<p><img src="/files/14/diary/blog1.jpg"/></p>
|
||||
<p>First paragraph</p>
|
||||
<p><img src="/files/14/diary/blog2.jpg"/><br/><img src="/files/14/diary/blog3.jpg"/></p>
|
||||
<p>Second paragraph with <img src="/files/14/diary/blog4.jpg"/> inline image</p>
|
||||
</div>
|
||||
</div>
|
||||
</article>
|
||||
`;
|
||||
|
||||
(global.fetch as jest.Mock).mockResolvedValueOnce({
|
||||
ok: true,
|
||||
text: () => Promise.resolve(htmlWithComplexImages),
|
||||
});
|
||||
|
||||
const result = await scraper.scrape('https://sakurazaka46.com/s/s46/diary/detail/1');
|
||||
|
||||
expect(result.content).toContain('[Image: blog1.jpg]');
|
||||
expect(result.content).toContain('[Image: blog2.jpg]');
|
||||
expect(result.content).toContain('[Image: blog3.jpg]');
|
||||
expect(result.content).toContain('[Image: blog4.jpg]');
|
||||
expect(result.content).toContain('First paragraph');
|
||||
expect(result.content).toContain('Second paragraph');
|
||||
});
|
||||
|
||||
it('should set correct User-Agent header', async () => {
|
||||
(global.fetch as jest.Mock).mockResolvedValueOnce({
|
||||
ok: true,
|
||||
text: () => Promise.resolve(mockBlogHtml),
|
||||
});
|
||||
|
||||
await scraper.scrape('https://sakurazaka46.com/s/s46/diary/detail/68008');
|
||||
|
||||
expect(global.fetch).toHaveBeenCalledWith(
|
||||
'https://sakurazaka46.com/s/s46/diary/detail/68008',
|
||||
expect.objectContaining({
|
||||
headers: expect.objectContaining({
|
||||
'User-Agent': expect.stringContaining('Mozilla/5.0'),
|
||||
}),
|
||||
})
|
||||
);
|
||||
});
|
||||
});
|
||||
});
|
||||
238
apps/backend/src/import/scrapers/sakurazaka-scraper.ts
Normal file
238
apps/backend/src/import/scrapers/sakurazaka-scraper.ts
Normal file
@@ -0,0 +1,238 @@
|
||||
import { Logger } from '@nestjs/common';
|
||||
import { IWebScraper, WebScraperResult } from '../interfaces/web-scraper.interface';
|
||||
|
||||
/**
|
||||
* Web scraper for Sakurazaka46 blog posts
|
||||
* Handles URLs like: https://sakurazaka46.com/s/s46/diary/detail/{id}
|
||||
*/
|
||||
export class SakurazakaScraper implements IWebScraper {
|
||||
private readonly logger = new Logger(SakurazakaScraper.name);
|
||||
private readonly baseUrl = 'sakurazaka46.com';
|
||||
|
||||
canHandle(url: string): boolean {
|
||||
try {
|
||||
const urlObj = new URL(url);
|
||||
return (
|
||||
urlObj.hostname === this.baseUrl ||
|
||||
urlObj.hostname === `www.${this.baseUrl}`
|
||||
);
|
||||
} catch {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
async scrape(url: string): Promise<WebScraperResult> {
|
||||
this.logger.log(`Starting to scrape URL: ${url}`);
|
||||
|
||||
try {
|
||||
// Fetch the page content
|
||||
this.logger.debug(`Fetching page content from: ${url}`);
|
||||
const response = await fetch(url, {
|
||||
headers: {
|
||||
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
|
||||
},
|
||||
});
|
||||
|
||||
if (!response.ok) {
|
||||
this.logger.error(`Failed to fetch page: ${response.status} ${response.statusText}`);
|
||||
throw new Error(`Failed to fetch page: ${response.status} ${response.statusText}`);
|
||||
}
|
||||
|
||||
this.logger.debug(`Page fetched successfully, reading content...`);
|
||||
const html = await response.text();
|
||||
this.logger.debug(`HTML content received: ${html.length} bytes`);
|
||||
|
||||
// Extract content
|
||||
this.logger.debug('Extracting content from HTML...');
|
||||
const content = this.extractContent(html);
|
||||
this.logger.log(`Content extracted: ${content.length} characters`);
|
||||
|
||||
const metadata = this.extractMetadata(html, url);
|
||||
this.logger.log(`Metadata extracted:`, metadata);
|
||||
|
||||
return {
|
||||
content,
|
||||
metadata,
|
||||
};
|
||||
} catch (error) {
|
||||
this.logger.error(`Failed to scrape Sakurazaka46 blog:`, error);
|
||||
throw new Error(
|
||||
`Failed to scrape Sakurazaka46 blog: ${error instanceof Error ? error.message : 'Unknown error'}`
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
private extractContent(html: string): string {
|
||||
// Find the article element
|
||||
const articleMatch = html.match(/<article[^>]*>([\s\S]*?)<\/article>/i);
|
||||
if (!articleMatch) {
|
||||
throw new Error('Could not find article content in the page');
|
||||
}
|
||||
|
||||
const article = articleMatch[0];
|
||||
const parts: string[] = [];
|
||||
|
||||
// Extract title
|
||||
const titleMatch = article.match(/<h1[^>]*class="[^"]*title[^"]*"[^>]*>([\s\S]*?)<\/h1>/i);
|
||||
if (titleMatch) {
|
||||
const title = this.stripHtml(titleMatch[1]).trim();
|
||||
if (title) parts.push(`Title: ${title}`);
|
||||
}
|
||||
|
||||
// Extract author
|
||||
const authorMatch = article.match(/<p[^>]*class="[^"]*eigo[^"]*"[^>]*>([\s\S]*?)<\/p>/i);
|
||||
if (authorMatch) {
|
||||
const author = this.stripHtml(authorMatch[1]).trim();
|
||||
if (author) parts.push(`Author: ${author}`);
|
||||
}
|
||||
|
||||
// Extract date from the calendar structure
|
||||
const yearMatch = article.match(/<span[^>]*class="[^"]*ym-year[^"]*"[^>]*>([\s\S]*?)<\/span>/i);
|
||||
const monthMatch = article.match(/<span[^>]*class="[^"]*ym-month[^"]*"[^>]*>([\s\S]*?)<\/span>/i);
|
||||
const dayMatch = article.match(/<p[^>]*class="[^"]*date[^"]*"[^>]*>([\s\S]*?)<\/p>/i);
|
||||
|
||||
if (yearMatch && monthMatch && dayMatch) {
|
||||
const year = this.stripHtml(yearMatch[1]).trim();
|
||||
const month = this.stripHtml(monthMatch[1]).trim();
|
||||
const day = this.stripHtml(dayMatch[1]).trim();
|
||||
parts.push(`Date: ${year}-${month.padStart(2, '0')}-${day.padStart(2, '0')}`);
|
||||
}
|
||||
|
||||
// Extract main content from box-article
|
||||
// Find box-article and capture until we hit navigation sections
|
||||
const boxMatch = article.match(/<div[^>]*class="[^"]*box-article[^"]*"[^>]*>([\s\S]*)$/i);
|
||||
if (boxMatch) {
|
||||
let content = boxMatch[1];
|
||||
// Find where navigation starts and cut there
|
||||
const navIndex = content.search(/<div[^>]*class="[^"]*col-l[^"]*"[^>]*>/i);
|
||||
if (navIndex !== -1) {
|
||||
content = content.substring(0, navIndex);
|
||||
}
|
||||
// Also cut at app-button section
|
||||
const appIndex = content.search(/<div[^>]*class="[^"]*app-button[^"]*"/i);
|
||||
if (appIndex !== -1) {
|
||||
content = content.substring(0, appIndex);
|
||||
}
|
||||
const mainContent = this.cleanBlogContent(content);
|
||||
if (mainContent) {
|
||||
parts.push('');
|
||||
parts.push(mainContent);
|
||||
}
|
||||
}
|
||||
|
||||
return parts.join('\n');
|
||||
}
|
||||
|
||||
private extractMetadata(html: string, url: string): WebScraperResult['metadata'] {
|
||||
const metadata: WebScraperResult['metadata'] = {
|
||||
sourceName: 'Sakurazaka46 Blog',
|
||||
url,
|
||||
};
|
||||
|
||||
// Extract title
|
||||
const titleMatch = html.match(/<h1[^>]*class="[^"]*title[^"]*"[^>]*>([\s\S]*?)<\/h1>/i);
|
||||
if (titleMatch) {
|
||||
metadata.title = this.stripHtml(titleMatch[1]).trim();
|
||||
}
|
||||
|
||||
// Extract author
|
||||
const authorMatch = html.match(/<p[^>]*class="[^"]*eigo[^"]*"[^>]*>([\s\S]*?)<\/p>/i);
|
||||
if (authorMatch) {
|
||||
metadata.author = this.stripHtml(authorMatch[1]).trim();
|
||||
}
|
||||
|
||||
// Extract date
|
||||
const yearMatch = html.match(/<span[^>]*class="[^"]*ym-year[^"]*"[^>]*>([\s\S]*?)<\/span>/i);
|
||||
const monthMatch = html.match(/<span[^>]*class="[^"]*ym-month[^"]*"[^>]*>([\s\S]*?)<\/span>/i);
|
||||
const dayMatch = html.match(/<p[^>]*class="[^"]*date[^"]*"[^>]*>([\s\S]*?)<\/p>/i);
|
||||
|
||||
if (yearMatch && monthMatch && dayMatch) {
|
||||
const year = this.stripHtml(yearMatch[1]).trim();
|
||||
const month = this.stripHtml(monthMatch[1]).trim().padStart(2, '0');
|
||||
const day = this.stripHtml(dayMatch[1]).trim().padStart(2, '0');
|
||||
metadata.publishedDate = `${year}-${month}-${day}`;
|
||||
}
|
||||
|
||||
// Extract blog ID from URL
|
||||
const idMatch = url.match(/detail\/(\d+)/);
|
||||
if (idMatch) {
|
||||
metadata.blogId = idMatch[1];
|
||||
}
|
||||
|
||||
return metadata;
|
||||
}
|
||||
|
||||
private stripHtml(html: string): string {
|
||||
return html
|
||||
.replace(/<br\s*\/?>/gi, '\n')
|
||||
.replace(/<\/p>/gi, '\n')
|
||||
.replace(/<[^>]+>/g, '')
|
||||
.replace(/ /g, ' ')
|
||||
.replace(/</g, '<')
|
||||
.replace(/>/g, '>')
|
||||
.replace(/&/g, '&')
|
||||
.replace(/"/g, '"')
|
||||
.replace(/'/g, "'")
|
||||
.trim();
|
||||
}
|
||||
|
||||
private cleanBlogContent(html: string): string {
|
||||
// Remove HTML comments
|
||||
html = html.replace(/<!--[\s\S]*?-->/g, '');
|
||||
|
||||
// Remove blog navigation/footer section
|
||||
html = html.replace(/<div[^>]*class="[^"]*blog-foot-nav[^"]*"[\s\S]*$/i, '');
|
||||
|
||||
// Remove app button section
|
||||
html = html.replace(/<div[^>]*class="[^"]*app-button[^"]*"[\s\S]*$/i, '');
|
||||
|
||||
// Remove script and style tags
|
||||
let cleaned = html
|
||||
.replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
|
||||
.replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '');
|
||||
|
||||
// Convert <br> tags to newlines
|
||||
cleaned = cleaned.replace(/<br\s*\/?>/gi, '\n');
|
||||
|
||||
// Handle image tags - convert to descriptive text
|
||||
cleaned = cleaned.replace(/<img[^>]*src="([^"]*)"[^>]*>/gi, (match, src) => {
|
||||
// Extract filename from src
|
||||
const filename = src.split('/').pop();
|
||||
return `\n[Image: ${filename}]\n`;
|
||||
});
|
||||
|
||||
// Convert paragraph closings to double newlines
|
||||
cleaned = cleaned.replace(/<\/p>/gi, '\n\n');
|
||||
|
||||
// Convert div closings to single newlines
|
||||
cleaned = cleaned.replace(/<\/div>/gi, '\n');
|
||||
|
||||
// Remove remaining HTML tags
|
||||
cleaned = cleaned.replace(/<[^>]+>/g, '');
|
||||
|
||||
// Decode HTML entities
|
||||
cleaned = cleaned
|
||||
.replace(/ /g, ' ')
|
||||
.replace(/</g, '<')
|
||||
.replace(/>/g, '>')
|
||||
.replace(/&/g, '&')
|
||||
.replace(/"/g, '"')
|
||||
.replace(/'/g, "'");
|
||||
|
||||
// Clean up excessive whitespace
|
||||
cleaned = cleaned
|
||||
.replace(/\n{4,}/g, '\n\n\n')
|
||||
.trim();
|
||||
|
||||
// Remove trailing empty image references
|
||||
cleaned = cleaned.replace(/\n*\[Image:[^\]]*\]\s*$/g, '');
|
||||
|
||||
// Remove trailing author/date section if present
|
||||
cleaned = cleaned.replace(/\s+村井\s*優\s*\d{4}\/\d{2}\/\d{2}\s*\d{2}:\d{2}\s*$/g, '');
|
||||
|
||||
// Final trim
|
||||
cleaned = cleaned.trim();
|
||||
|
||||
return cleaned;
|
||||
}
|
||||
}
|
||||
@@ -1,13 +1,15 @@
|
||||
import { Injectable } from '@nestjs/common';
|
||||
import { Injectable, Logger } from '@nestjs/common';
|
||||
import { IEmbeddingProvider } from './interfaces/embedding-provider.interface';
|
||||
import { LocalEmbeddingProvider } from './providers/local-embedding.provider';
|
||||
|
||||
@Injectable()
|
||||
export class EmbeddingService {
|
||||
private readonly logger = new Logger(EmbeddingService.name);
|
||||
private provider: IEmbeddingProvider;
|
||||
|
||||
constructor() {
|
||||
const providerType = process.env.EMBEDDING_PROVIDER || 'local';
|
||||
this.logger.log(`Initializing EmbeddingService with provider: ${providerType}`);
|
||||
|
||||
switch (providerType) {
|
||||
case 'local':
|
||||
@@ -19,7 +21,11 @@ export class EmbeddingService {
|
||||
}
|
||||
|
||||
async embed(text: string): Promise<number[]> {
|
||||
return this.provider.embed(text);
|
||||
this.logger.debug(`Generating embedding for text (${text.length} chars)...`);
|
||||
const startTime = Date.now();
|
||||
const result = await this.provider.embed(text);
|
||||
this.logger.debug(`Embedding generated in ${Date.now() - startTime}ms`);
|
||||
return result;
|
||||
}
|
||||
|
||||
async embedBatch(texts: string[]): Promise<number[][]> {
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
import { Injectable } from '@nestjs/common';
|
||||
import { Injectable, Logger } from '@nestjs/common';
|
||||
import { EmbeddingService } from './embedding.service';
|
||||
import { VectorStoreService, SearchResult } from './vector-store.service';
|
||||
import { MemoryType } from '@prisma/client';
|
||||
@@ -11,6 +11,8 @@ export interface MemoryContext {
|
||||
|
||||
@Injectable()
|
||||
export class MemoryService {
|
||||
private readonly logger = new Logger(MemoryService.name);
|
||||
|
||||
constructor(
|
||||
private embeddingService: EmbeddingService,
|
||||
private vectorStore: VectorStoreService,
|
||||
@@ -26,8 +28,15 @@ export class MemoryService {
|
||||
metadata?: any;
|
||||
},
|
||||
): Promise<void> {
|
||||
const startTime = Date.now();
|
||||
this.logger.debug(`[${options.knowledgeId || options.conversationId}] Generating embedding for content (${content.length} chars)...`);
|
||||
|
||||
const embedding = await this.embeddingService.embed(content);
|
||||
this.logger.debug(`[${options.knowledgeId || options.conversationId}] Embedding generated in ${Date.now() - startTime}ms, dimension: ${embedding.length}`);
|
||||
|
||||
this.logger.debug(`[${options.knowledgeId || options.conversationId}] Storing in vector store...`);
|
||||
await this.vectorStore.store(content, embedding, memoryType, options);
|
||||
this.logger.debug(`[${options.knowledgeId || options.conversationId}] Stored in vector store in ${Date.now() - startTime}ms`);
|
||||
}
|
||||
|
||||
async retrieveRelevantMemories(
|
||||
@@ -40,9 +49,20 @@ export class MemoryService {
|
||||
memoryType?: MemoryType;
|
||||
},
|
||||
): Promise<MemoryContext[]> {
|
||||
const { limit = 5, threshold = 0.6, conversationId, characterId, memoryType } = options;
|
||||
|
||||
this.logger.debug(
|
||||
`[retrieveRelevantMemories] Query: "${query.substring(0, 100)}...", type: ${memoryType}, characterId: ${characterId}, conversationId: ${conversationId}, threshold: ${threshold}`,
|
||||
);
|
||||
|
||||
const embedding = await this.embeddingService.embed(query);
|
||||
const results = await this.vectorStore.searchSimilar(embedding, options);
|
||||
|
||||
this.logger.debug(`[retrieveRelevantMemories] Found ${results.length} results for type ${memoryType}:`);
|
||||
results.forEach((r, i) => {
|
||||
this.logger.debug(` [${i}] similarity: ${r.similarity.toFixed(4)}, content: "${r.content.substring(0, 80)}..."`);
|
||||
});
|
||||
|
||||
return results.map((result) => ({
|
||||
content: result.content,
|
||||
metadata: result.metadata,
|
||||
@@ -50,37 +70,44 @@ export class MemoryService {
|
||||
}));
|
||||
}
|
||||
|
||||
async buildContextForConversation(
|
||||
conversationId: string,
|
||||
currentMessage: string,
|
||||
characterId: string,
|
||||
): Promise<string> {
|
||||
// Retrieve recent conversation memories
|
||||
const conversationMemories = await this.retrieveRelevantMemories(
|
||||
currentMessage,
|
||||
{
|
||||
limit: 3,
|
||||
threshold: 0.6,
|
||||
conversationId,
|
||||
memoryType: 'conversation',
|
||||
},
|
||||
async buildContextForConversation(conversationId: string, currentMessage: string, characterId: string): Promise<string> {
|
||||
this.logger.debug(
|
||||
`[buildContextForConversation] Building context for conversation ${conversationId}, character ${characterId}, message: "${currentMessage.substring(0, 100)}"`,
|
||||
);
|
||||
|
||||
// Retrieve character knowledge
|
||||
const characterMemories = await this.retrieveRelevantMemories(
|
||||
currentMessage,
|
||||
{
|
||||
limit: 3,
|
||||
threshold: 0.7,
|
||||
characterId,
|
||||
memoryType: 'character',
|
||||
},
|
||||
);
|
||||
// Retrieve recent conversation memories
|
||||
const conversationMemories = await this.retrieveRelevantMemories(currentMessage, {
|
||||
limit: 3,
|
||||
threshold: 0.6,
|
||||
conversationId,
|
||||
memoryType: 'conversation',
|
||||
});
|
||||
|
||||
// Retrieve character knowledge - using multilingual embedding model for cross-lingual support
|
||||
// Lower threshold (0.3) for cross-lingual matching (English query -> Japanese content)
|
||||
let characterMemories = await this.retrieveRelevantMemories(currentMessage, {
|
||||
limit: 3,
|
||||
threshold: 0.3,
|
||||
characterId,
|
||||
memoryType: 'character',
|
||||
});
|
||||
|
||||
// Fallback: If vector search returns no results, retrieve the most recent knowledge
|
||||
// This ensures the character always has some context for roleplaying
|
||||
if (characterMemories.length === 0) {
|
||||
this.logger.debug(`[buildContextForConversation] No similar memories found, falling back to most recent knowledge`);
|
||||
const recentResults = await this.vectorStore.getRecentByCharacterId(characterId, 2);
|
||||
characterMemories = recentResults.map((r) => ({
|
||||
content: r.content,
|
||||
metadata: r.metadata,
|
||||
similarity: r.similarity,
|
||||
}));
|
||||
}
|
||||
|
||||
const contextParts: string[] = [];
|
||||
|
||||
if (characterMemories.length > 0) {
|
||||
contextParts.push('Relevant character knowledge:');
|
||||
contextParts.push('Here are some descriptions of yourself. These knowledge are your history, what you have done, talked, or imagined:');
|
||||
characterMemories.forEach((memory) => {
|
||||
contextParts.push(`- ${memory.content}`);
|
||||
});
|
||||
@@ -93,30 +120,26 @@ export class MemoryService {
|
||||
});
|
||||
}
|
||||
|
||||
return contextParts.join('\n');
|
||||
const result = contextParts.join('\n');
|
||||
this.logger.debug(`[buildContextForConversation] Final context (${result.length} chars):\n${result || '(empty)'}`);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
async storeConversationMessage(
|
||||
content: string,
|
||||
conversationId: string,
|
||||
metadata?: any,
|
||||
): Promise<void> {
|
||||
async storeConversationMessage(content: string, conversationId: string, metadata?: any): Promise<void> {
|
||||
await this.addMemory(content, 'conversation', {
|
||||
conversationId,
|
||||
metadata,
|
||||
});
|
||||
}
|
||||
|
||||
async storeCharacterKnowledge(
|
||||
content: string,
|
||||
characterId: string,
|
||||
knowledgeId: string,
|
||||
metadata?: any,
|
||||
): Promise<void> {
|
||||
async storeCharacterKnowledge(content: string, characterId: string, knowledgeId: string, metadata?: any): Promise<void> {
|
||||
this.logger.debug(`[${knowledgeId}] Storing character knowledge chunk for character: ${characterId}`);
|
||||
await this.addMemory(content, 'character', {
|
||||
characterId,
|
||||
knowledgeId,
|
||||
metadata,
|
||||
});
|
||||
this.logger.debug(`[${knowledgeId}] Character knowledge chunk stored successfully`);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
import { Injectable, OnModuleInit } from '@nestjs/common';
|
||||
import { Injectable, OnModuleInit, Logger } from '@nestjs/common';
|
||||
import { IEmbeddingProvider } from '../interfaces/embedding-provider.interface';
|
||||
import { pipeline, FeatureExtractionPipeline } from '@xenova/transformers';
|
||||
|
||||
@@ -7,6 +7,8 @@ export class LocalEmbeddingProvider implements IEmbeddingProvider, OnModuleInit
|
||||
private extractor: FeatureExtractionPipeline | null = null;
|
||||
private readonly modelName: string;
|
||||
private readonly dimension: number;
|
||||
private readonly logger = new Logger(LocalEmbeddingProvider.name);
|
||||
private isLoading = false;
|
||||
|
||||
constructor() {
|
||||
this.modelName = process.env.EMBEDDING_MODEL || 'Xenova/all-MiniLM-L6-v2';
|
||||
@@ -15,31 +17,72 @@ export class LocalEmbeddingProvider implements IEmbeddingProvider, OnModuleInit
|
||||
|
||||
async onModuleInit() {
|
||||
// Lazy initialization - model will be loaded on first use
|
||||
this.logger.log(`LocalEmbeddingProvider initialized with model: ${this.modelName}`);
|
||||
}
|
||||
|
||||
private async getExtractor(): Promise<FeatureExtractionPipeline> {
|
||||
if (!this.extractor) {
|
||||
this.extractor = await pipeline('feature-extraction', this.modelName, {
|
||||
quantized: false, // Use full precision for better quality
|
||||
});
|
||||
if (this.isLoading) {
|
||||
// Wait for existing load to complete
|
||||
while (this.isLoading) {
|
||||
await new Promise((resolve) => setTimeout(resolve, 100));
|
||||
}
|
||||
if (this.extractor) {
|
||||
return this.extractor;
|
||||
}
|
||||
}
|
||||
|
||||
this.isLoading = true;
|
||||
this.logger.log(`Loading embedding model: ${this.modelName}...`);
|
||||
|
||||
try {
|
||||
// Use quantized model to reduce memory usage
|
||||
this.extractor = await pipeline('feature-extraction', this.modelName, {
|
||||
quantized: true,
|
||||
revision: 'main',
|
||||
});
|
||||
this.logger.log('Embedding model loaded successfully');
|
||||
} catch (error) {
|
||||
this.logger.error('Failed to load embedding model:', error);
|
||||
throw error;
|
||||
} finally {
|
||||
this.isLoading = false;
|
||||
}
|
||||
}
|
||||
return this.extractor;
|
||||
}
|
||||
|
||||
async embed(text: string): Promise<number[]> {
|
||||
// Truncate text to prevent excessive memory usage
|
||||
const maxLength = 512; // Maximum tokens the model can handle efficiently
|
||||
const truncatedText = text.length > maxLength * 4 ? text.substring(0, maxLength * 4) : text;
|
||||
|
||||
const extractor = await this.getExtractor();
|
||||
const output = await extractor(text, { pooling: 'mean', normalize: true });
|
||||
return Array.from(output.data as Float32Array);
|
||||
const output = await extractor(truncatedText, { pooling: 'mean', normalize: true });
|
||||
|
||||
// Convert to array - this creates a copy, allowing original to be GC'd
|
||||
const result = Array.from(output.data as Float32Array);
|
||||
|
||||
// Add delay to allow GC between embeddings
|
||||
await new Promise((resolve) => setTimeout(resolve, 10));
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
async embedBatch(texts: string[]): Promise<number[][]> {
|
||||
const extractor = await this.getExtractor();
|
||||
const outputs = await Promise.all(
|
||||
texts.map((text) =>
|
||||
extractor(text, { pooling: 'mean', normalize: true }),
|
||||
),
|
||||
);
|
||||
return outputs.map((output) => Array.from(output.data as Float32Array));
|
||||
const results: number[][] = [];
|
||||
|
||||
// Process one at a time to minimize memory spikes
|
||||
for (let i = 0; i < texts.length; i++) {
|
||||
results.push(await this.embed(texts[i]));
|
||||
|
||||
// Add delay every few items to allow GC
|
||||
if (i > 0 && i % 2 === 0) {
|
||||
await new Promise((resolve) => setTimeout(resolve, 50));
|
||||
}
|
||||
}
|
||||
|
||||
return results;
|
||||
}
|
||||
|
||||
getDimension(): number {
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
import { Injectable } from '@nestjs/common';
|
||||
import { Injectable, Logger } from '@nestjs/common';
|
||||
import { PrismaService } from '../prisma/prisma.service';
|
||||
import { MemoryType, VectorMemory } from '@prisma/client';
|
||||
|
||||
@@ -12,6 +12,8 @@ export interface SearchResult {
|
||||
|
||||
@Injectable()
|
||||
export class VectorStoreService {
|
||||
private readonly logger = new Logger(VectorStoreService.name);
|
||||
|
||||
constructor(private prisma: PrismaService) {}
|
||||
|
||||
async store(
|
||||
@@ -25,23 +27,35 @@ export class VectorStoreService {
|
||||
metadata?: any;
|
||||
},
|
||||
): Promise<VectorMemory> {
|
||||
const startTime = Date.now();
|
||||
const vectorString = `[${embedding.join(',')}]`;
|
||||
const metadataJson = options.metadata ? JSON.stringify(options.metadata) : null;
|
||||
|
||||
this.logger.debug(`[${options.knowledgeId}] Storing vector memory, type: ${memoryType}, vector dim: ${embedding.length}`);
|
||||
|
||||
return this.prisma.$queryRaw<VectorMemory[]>`
|
||||
INSERT INTO "VectorMemory" (id, content, embedding, "memoryType", metadata, "conversationId", "characterId", "knowledgeId", "createdAt")
|
||||
VALUES (
|
||||
gen_random_uuid(),
|
||||
${content},
|
||||
${vectorString}::vector,
|
||||
${memoryType},
|
||||
${options.metadata ? JSON.stringify(options.metadata) : null}::jsonb,
|
||||
${options.conversationId || null},
|
||||
${options.characterId || null},
|
||||
${options.knowledgeId || null},
|
||||
NOW()
|
||||
)
|
||||
RETURNING *
|
||||
`.then((results) => results[0]);
|
||||
try {
|
||||
const result = await this.prisma.$queryRaw<VectorMemory[]>`
|
||||
INSERT INTO "VectorMemory" (id, content, embedding, "memoryType", metadata, "conversationId", "characterId", "knowledgeId", "createdAt")
|
||||
VALUES (
|
||||
gen_random_uuid(),
|
||||
${content},
|
||||
${vectorString}::vector,
|
||||
${memoryType},
|
||||
${metadataJson}::jsonb,
|
||||
${options.conversationId || null},
|
||||
${options.characterId || null},
|
||||
${options.knowledgeId || null},
|
||||
NOW()
|
||||
)
|
||||
RETURNING *
|
||||
`.then((results) => results[0]);
|
||||
|
||||
this.logger.debug(`[${options.knowledgeId}] Vector memory stored in ${Date.now() - startTime}ms, id: ${result.id}`);
|
||||
return result;
|
||||
} catch (error) {
|
||||
this.logger.error(`[${options.knowledgeId}] Failed to store vector memory:`, error);
|
||||
throw error;
|
||||
}
|
||||
}
|
||||
|
||||
async searchSimilar(
|
||||
@@ -93,7 +107,34 @@ export class VectorStoreService {
|
||||
LIMIT $3
|
||||
`;
|
||||
|
||||
return this.prisma.$queryRawUnsafe<SearchResult[]>(query, ...params);
|
||||
this.logger.debug(`[searchSimilar] Query params: threshold=${threshold}, limit=${limit}, characterId=${options.characterId}, memoryType=${options.memoryType}`);
|
||||
|
||||
const results = await this.prisma.$queryRawUnsafe<SearchResult[]>(query, ...params);
|
||||
|
||||
this.logger.debug(`[searchSimilar] Found ${results.length} results matching criteria`);
|
||||
|
||||
// Debug: Show all similarities for character knowledge
|
||||
if (options.characterId && options.memoryType === 'character') {
|
||||
const allQuery = `
|
||||
SELECT
|
||||
id,
|
||||
content,
|
||||
"memoryType",
|
||||
metadata,
|
||||
1 - (embedding <=> $1::vector) as similarity
|
||||
FROM "VectorMemory"
|
||||
WHERE "characterId" = $2 AND "memoryType" = $3
|
||||
ORDER BY embedding <=> $1::vector
|
||||
LIMIT 10
|
||||
`;
|
||||
const allResults = await this.prisma.$queryRawUnsafe<SearchResult[]>(allQuery, vectorString, options.characterId, options.memoryType);
|
||||
this.logger.debug(`[searchSimilar] All ${allResults.length} similarities for character ${options.characterId}:`);
|
||||
allResults.forEach((r, i) => {
|
||||
this.logger.debug(` [${i}] similarity=${r.similarity.toFixed(4)}, content="${r.content.substring(0, 50)}..."`);
|
||||
});
|
||||
}
|
||||
|
||||
return results;
|
||||
}
|
||||
|
||||
async deleteByConversation(conversationId: string): Promise<void> {
|
||||
@@ -113,4 +154,28 @@ export class VectorStoreService {
|
||||
where: { knowledgeId },
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
* Retrieve most recent character knowledge (fallback when similarity search returns nothing)
|
||||
*/
|
||||
async getRecentByCharacterId(characterId: string, limit: number = 2): Promise<SearchResult[]> {
|
||||
this.logger.debug(`[getRecentByCharacterId] Retrieving recent knowledge for character ${characterId}`);
|
||||
|
||||
const results = await this.prisma.$queryRaw<SearchResult[]>`
|
||||
SELECT
|
||||
id,
|
||||
content,
|
||||
"memoryType",
|
||||
metadata,
|
||||
1.0 as similarity
|
||||
FROM "VectorMemory"
|
||||
WHERE "characterId" = ${characterId} AND "memoryType" = 'character'
|
||||
ORDER BY "createdAt" DESC
|
||||
LIMIT ${limit}
|
||||
`;
|
||||
|
||||
this.logger.debug(`[getRecentByCharacterId] Found ${results.length} recent memories`);
|
||||
|
||||
return results;
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user