From e033d67ec1d688bd3a32c041f4843c4e22f3d685 Mon Sep 17 00:00:00 2001 From: GW_MC <72297530+GWMCwing@users.noreply.github.com> Date: Tue, 24 Feb 2026 14:29:26 +0000 Subject: [PATCH] feat: Implement knowledge import feature for characters - Added KnowledgeImport page for importing character knowledge from URLs. - Integrated URL validation and error handling for unsupported websites. - Created API endpoints for importing content from URLs and retrieving character knowledge. - Enhanced VectorStoreService with logging and error handling for vector memory storage. - Updated frontend to display knowledge sources and manage them effectively. - Added support for fetching recent character knowledge as a fallback in similarity searches. - Updated OpenAPI documentation to reflect new import functionality. --- .env.example | 18 + README.md | 39 ++ apps/backend/jest.config.js | 14 + apps/backend/package.json | 6 +- apps/backend/src/app.module.ts | 11 +- apps/backend/src/chat/chat.gateway.ts | 7 +- apps/backend/src/chat/chat.service.ts | 98 ++--- apps/backend/src/common/middleware/index.ts | 1 + .../middleware/request-logger.middleware.ts | 62 +++ apps/backend/src/import/import.controller.ts | 33 ++ apps/backend/src/import/import.service.ts | 355 +++++++++++++--- apps/backend/src/import/interfaces/index.ts | 2 + .../interfaces/web-scraper.interface.ts | 23 ++ apps/backend/src/import/scrapers/index.ts | 1 + .../scrapers/sakurazaka-scraper.spec.ts | 337 +++++++++++++++ .../src/import/scrapers/sakurazaka-scraper.ts | 238 +++++++++++ apps/backend/src/vector/embedding.service.ts | 10 +- apps/backend/src/vector/memory.service.ts | 99 +++-- .../providers/local-embedding.provider.ts | 69 +++- .../src/vector/vector-store.service.ts | 99 ++++- apps/frontend/src/App.tsx | 10 + .../src/api/generated/import/import.ts | 30 +- .../src/api/generated/model/importUrlDto.ts | 12 + .../frontend/src/api/generated/model/index.ts | 2 + .../api/generated/model/uploadResponseDto.ts | 14 + apps/frontend/src/pages/CharacterForm.tsx | 30 ++ apps/frontend/src/pages/Chat.tsx | 42 +- apps/frontend/src/pages/KnowledgeImport.tsx | 383 ++++++++++++++++++ openapi/openapi.json | 83 ++++ pnpm-lock.yaml | 94 +++++ 30 files changed, 2018 insertions(+), 204 deletions(-) create mode 100644 apps/backend/jest.config.js create mode 100644 apps/backend/src/common/middleware/index.ts create mode 100644 apps/backend/src/common/middleware/request-logger.middleware.ts create mode 100644 apps/backend/src/import/interfaces/index.ts create mode 100644 apps/backend/src/import/interfaces/web-scraper.interface.ts create mode 100644 apps/backend/src/import/scrapers/index.ts create mode 100644 apps/backend/src/import/scrapers/sakurazaka-scraper.spec.ts create mode 100644 apps/backend/src/import/scrapers/sakurazaka-scraper.ts create mode 100644 apps/frontend/src/api/generated/model/importUrlDto.ts create mode 100644 apps/frontend/src/api/generated/model/uploadResponseDto.ts create mode 100644 apps/frontend/src/pages/KnowledgeImport.tsx diff --git a/.env.example b/.env.example index a03df2b..7d06742 100644 --- a/.env.example +++ b/.env.example @@ -20,6 +20,24 @@ EMBEDDING_MODEL=Xenova/all-MiniLM-L6-v2 EMBEDDING_DIMENSION=384 EMBEDDING_DEVICE=cpu +# Use quantized model for lower memory usage (~4x smaller, slightly less accurate) +# Set to 'true' for low-memory systems (512MB-1GB RAM) +EMBEDDING_QUANTIZED=false + +# Node.js Memory Configuration (increase if embedding causes OOM) +# For 512MB RAM VPS: NODE_OPTIONS=--max-old-space-size=384 +# For 1GB RAM VPS: NODE_OPTIONS=--max-old-space-size=768 +# For 2GB RAM VPS: NODE_OPTIONS=--max-old-space-size=1536 +# Default (no env var): Node uses ~4GB or system limit +#NODE_OPTIONS=--max-old-space-size=768 + +# Request Logging Configuration +# Enable/disable request logging (default: true) +#REQUEST_LOGGER=true + +# Log level: verbose (detailed), standard (default), minimal (status only) +#REQUEST_LOGGER_LEVEL=standard + # HuggingFace API (optional - if not using local embeddings) # HUGGINGFACE_API_KEY=hf_... diff --git a/README.md b/README.md index d2f75f9..76e2a93 100644 --- a/README.md +++ b/README.md @@ -186,6 +186,45 @@ docker-compose exec backend pnpm db:migrate **Note:** An external reverse proxy (nginx, Traefik, etc.) is expected for SSL termination and routing. See [deployment.md](doc/deployment.md) for configuration examples. +## Troubleshooting + +### Memory Issues (JavaScript heap out of memory) + +The local embedding model uses TensorFlow.js which can consume significant memory. If you encounter OOM errors: + +1. **Increase Node.js memory limit:** + ```bash + # In apps/backend/.env + NODE_OPTIONS=--max-old-space-size=768 + + # Or when starting manually + node --max-old-space-size=768 dist/main + ``` + +2. **Use low-memory mode (for 512MB RAM VPS):** + ```bash + cd apps/backend + pnpm run start:low-memory + ``` + +3. **Use quantized model (4x smaller memory):** + ```bash + # In apps/backend/.env + EMBEDDING_QUANTIZED=true + ``` + +4. **Limit import size:** The system automatically: + - Limits content to ~30KB per import + - Limits to 30 chunks maximum + - Processes chunks sequentially with delays + - Adds GC pauses between chunks + +### System Requirements + +- **Minimum:** 512MB RAM (with low-memory configuration) +- **Recommended:** 1GB+ RAM for comfortable operation +- **Import processing:** Large imports may take longer on low-memory systems + ## License MIT diff --git a/apps/backend/jest.config.js b/apps/backend/jest.config.js new file mode 100644 index 0000000..3088518 --- /dev/null +++ b/apps/backend/jest.config.js @@ -0,0 +1,14 @@ +module.exports = { + moduleFileExtensions: ['js', 'json', 'ts'], + rootDir: 'src', + testRegex: '.*\\.spec\\.ts$', + transform: { + '^.+\\.(t|j)s$': 'ts-jest', + }, + collectCoverageFrom: ['**/*.(t|j)s'], + coverageDirectory: '../coverage', + testEnvironment: 'node', + moduleNameMapper: { + '^@dreamchat/shared$': '/../../packages/shared/dist', + }, +}; diff --git a/apps/backend/package.json b/apps/backend/package.json index 5b6a6bf..d22b79e 100644 --- a/apps/backend/package.json +++ b/apps/backend/package.json @@ -4,7 +4,9 @@ "scripts": { "build": "nest build", "dev": "nest start --watch", - "start": "node dist/main", + "start": "node --max-old-space-size=768 dist/main", + "start:low-memory": "node --max-old-space-size=384 dist/main", + "start:high-memory": "node --max-old-space-size=1536 dist/main", "test": "jest", "test:watch": "jest --watch", "lint": "eslint \"{src,apps,libs,test}/**/*.ts\"", @@ -46,6 +48,7 @@ "@nestjs/cli": "^11.0.16", "@nestjs/testing": "^11.1.14", "@types/bcrypt": "^6.0.0", + "@types/jest": "^30.0.0", "@types/jsonwebtoken": "^9.0.0", "@types/multer": "^1.4.12", "@types/node": "^24.10.13", @@ -53,6 +56,7 @@ "@types/passport-local": "^1.0.0", "jest": "^30.2.0", "prisma": "^7.4.1", + "ts-jest": "^29.4.6", "typescript": "^5.3.0" } } \ No newline at end of file diff --git a/apps/backend/src/app.module.ts b/apps/backend/src/app.module.ts index 21ca7db..0b6f976 100644 --- a/apps/backend/src/app.module.ts +++ b/apps/backend/src/app.module.ts @@ -1,4 +1,4 @@ -import { Module } from '@nestjs/common'; +import { Module, NestModule, MiddlewareConsumer } from '@nestjs/common'; import { APP_GUARD } from '@nestjs/core'; import { PrismaModule } from './prisma/prisma.module'; import { AuthModule } from './auth/auth.module'; @@ -9,6 +9,7 @@ import { VectorModule } from './vector/vector.module'; import { ChatModule } from './chat/chat.module'; import { ImportModule } from './import/import.module'; import { JwtAuthGuard } from './auth/guards/jwt-auth.guard'; +import { RequestLoggerMiddleware } from './common/middleware'; @Module({ imports: [ @@ -29,4 +30,10 @@ import { JwtAuthGuard } from './auth/guards/jwt-auth.guard'; }, ], }) -export class AppModule {} +export class AppModule implements NestModule { + configure(consumer: MiddlewareConsumer) { + consumer + .apply(RequestLoggerMiddleware) + .forRoutes('*'); + } +} diff --git a/apps/backend/src/chat/chat.gateway.ts b/apps/backend/src/chat/chat.gateway.ts index e006ad4..8527d2c 100644 --- a/apps/backend/src/chat/chat.gateway.ts +++ b/apps/backend/src/chat/chat.gateway.ts @@ -8,7 +8,7 @@ import { OnGatewayDisconnect, } from '@nestjs/websockets'; import { Server, Socket } from 'socket.io'; -import { UseGuards } from '@nestjs/common'; +import { UseGuards, Logger } from '@nestjs/common'; import { ChatService } from './chat.service'; import { JwtService } from '@nestjs/jwt'; @@ -27,6 +27,8 @@ export class ChatGateway implements OnGatewayConnection, OnGatewayDisconnect { @WebSocketServer() server: Server; + private readonly logger = new Logger(ChatGateway.name); + constructor( private chatService: ChatService, private jwtService: JwtService, @@ -83,6 +85,8 @@ export class ChatGateway implements OnGatewayConnection, OnGatewayDisconnect { ) { if (!client.userId) return; + this.logger.debug(`[handleSendMessage] Received message from user ${client.userId} for conversation ${data.conversationId}: "${data.content}"`); + const room = `conversation:${data.conversationId}`; try { @@ -119,6 +123,7 @@ export class ChatGateway implements OnGatewayConnection, OnGatewayDisconnect { conversationId: data.conversationId, assistantMessage, }); + this.logger.debug(`[handleSendMessage] Message streaming completed for conversation ${data.conversationId}`); } catch (error) { client.emit('error', { conversationId: data.conversationId, diff --git a/apps/backend/src/chat/chat.service.ts b/apps/backend/src/chat/chat.service.ts index 59997ea..a675310 100644 --- a/apps/backend/src/chat/chat.service.ts +++ b/apps/backend/src/chat/chat.service.ts @@ -1,4 +1,4 @@ -import { Injectable, NotFoundException, ForbiddenException } from '@nestjs/common'; +import { Injectable, NotFoundException, ForbiddenException, Logger } from '@nestjs/common'; import { PrismaService } from '../prisma/prisma.service'; import { LLMService } from '../llm/llm.service'; import { MemoryService } from '../vector/memory.service'; @@ -8,6 +8,8 @@ import { Conversation, Message, MessageRole } from '@prisma/client'; @Injectable() export class ChatService { + private readonly logger = new Logger(ChatService.name); + constructor( private prisma: PrismaService, private llmService: LLMService, @@ -15,15 +17,9 @@ export class ChatService { private characterService: CharacterService, ) {} - async createConversation( - userId: string, - createConversationDto: CreateConversationDto, - ): Promise { + async createConversation(userId: string, createConversationDto: CreateConversationDto): Promise { // Verify character exists and user has access - const character = await this.characterService.findById( - createConversationDto.characterId, - userId, - ); + const character = await this.characterService.findById(createConversationDto.characterId, userId); return this.prisma.conversation.create({ data: { @@ -96,11 +92,7 @@ export class ChatService { }); } - async sendMessage( - conversationId: string, - userId: string, - sendMessageDto: SendMessageDto, - ): Promise<{ userMessage: Message; assistantMessage: Message }> { + async sendMessage(conversationId: string, userId: string, sendMessageDto: SendMessageDto): Promise<{ userMessage: Message; assistantMessage: Message }> { const conversation = await this.findConversationById(conversationId, userId); // Create user message @@ -113,25 +105,20 @@ export class ChatService { }); // Store user message in vector memory - await this.memoryService.storeConversationMessage( - `User: ${sendMessageDto.content}`, - conversationId, - { messageId: userMessage.id }, - ); + await this.memoryService.storeConversationMessage(`User: ${sendMessageDto.content}`, conversationId, { messageId: userMessage.id }); // Generate context from memory - const memoryContext = await this.memoryService.buildContextForConversation( - conversationId, - sendMessageDto.content, - conversation.characterId, - ); + const memoryContext = await this.memoryService.buildContextForConversation(conversationId, sendMessageDto.content, conversation.characterId); // Build messages for LLM - const messages = this.buildLLMMessages( - conversation.character.personalityPrompt, - conversation.messages, - sendMessageDto.content, - memoryContext, + const messages = this.buildLLMMessages(conversation.character.personalityPrompt, conversation.messages, sendMessageDto.content, memoryContext); + + // Grouped debug logging + this.logger.debug( + `[sendMessage] conversation=${conversationId}\n` + + `--- Knowledges/Context ---\n${memoryContext || '(none)'}\n` + + `--- Full Messages to LLM ---\n` + + messages.map((msg, idx) => `[${idx}] ${msg.role}: ${msg.content.substring(0, 200)}${msg.content.length > 200 ? '...' : ''}`).join('\n'), ); // Generate response @@ -161,20 +148,12 @@ export class ChatService { }); // Store assistant response in vector memory - await this.memoryService.storeConversationMessage( - `${conversation.character.name}: ${response.content}`, - conversationId, - { messageId: assistantMessage.id }, - ); + await this.memoryService.storeConversationMessage(`${conversation.character.name}: ${response.content}`, conversationId, { messageId: assistantMessage.id }); return { userMessage, assistantMessage }; } - async *streamMessage( - conversationId: string, - userId: string, - sendMessageDto: SendMessageDto, - ): AsyncGenerator<{ type: 'chunk' | 'message'; data: any }> { + async *streamMessage(conversationId: string, userId: string, sendMessageDto: SendMessageDto): AsyncGenerator<{ type: 'chunk' | 'message'; data: any }> { const conversation = await this.findConversationById(conversationId, userId); // Create user message @@ -189,25 +168,20 @@ export class ChatService { yield { type: 'message', data: { userMessage } }; // Store user message in vector memory - await this.memoryService.storeConversationMessage( - `User: ${sendMessageDto.content}`, - conversationId, - { messageId: userMessage.id }, - ); + await this.memoryService.storeConversationMessage(`User: ${sendMessageDto.content}`, conversationId, { messageId: userMessage.id }); // Generate context from memory - const memoryContext = await this.memoryService.buildContextForConversation( - conversationId, - sendMessageDto.content, - conversation.characterId, - ); + const memoryContext = await this.memoryService.buildContextForConversation(conversationId, sendMessageDto.content, conversation.characterId); // Build messages for LLM - const messages = this.buildLLMMessages( - conversation.character.personalityPrompt, - conversation.messages, - sendMessageDto.content, - memoryContext, + const messages = this.buildLLMMessages(conversation.character.personalityPrompt, conversation.messages, sendMessageDto.content, memoryContext); + + // Grouped debug logging + this.logger.debug( + `[streamMessage] conversation=${conversationId}\n` + + `--- Knowledges/Context ---\n${memoryContext || '(none)'}\n` + + `--- Full Messages to LLM ---\n` + + messages.map((msg, idx) => `[${idx}] ${msg.role}: ${msg.content.substring(0, 200)}${msg.content.length > 200 ? '...' : ''}`).join('\n'), ); // Generate streaming response @@ -233,10 +207,7 @@ export class ChatService { }); // Update conversation stats - const tokensUsed = this.llmService.countTokens([ - ...messages, - { role: 'assistant', content: fullContent }, - ]); + const tokensUsed = this.llmService.countTokens([...messages, { role: 'assistant', content: fullContent }]); await this.prisma.conversation.update({ where: { id: conversationId }, @@ -247,11 +218,7 @@ export class ChatService { }); // Store assistant response in vector memory - await this.memoryService.storeConversationMessage( - `${conversation.character.name}: ${fullContent}`, - conversationId, - { messageId: assistantMessage.id }, - ); + await this.memoryService.storeConversationMessage(`${conversation.character.name}: ${fullContent}`, conversationId, { messageId: assistantMessage.id }); yield { type: 'message', data: { assistantMessage } }; } @@ -265,7 +232,10 @@ export class ChatService { const messages: Array<{ role: 'system' | 'user' | 'assistant'; content: string }> = []; // Add system message with personality and context - let systemContent = personalityPrompt; + let systemContent = `You are now role playing as the character based on the following personality description. You should use this information to inform your responses and stay in character. Always try to stay in character and provide responses that align with the personality and history provided. You are now talking to a user. The user will say something, and you will respond as the character. Remember this is a conversation, keep it talking like and maintain your character\n\n`; + if (personalityPrompt) { + systemContent = personalityPrompt; + } if (memoryContext) { systemContent += `\n\nUse the following context to inform your responses:\n${memoryContext}`; } diff --git a/apps/backend/src/common/middleware/index.ts b/apps/backend/src/common/middleware/index.ts new file mode 100644 index 0000000..2d56842 --- /dev/null +++ b/apps/backend/src/common/middleware/index.ts @@ -0,0 +1 @@ +export { RequestLoggerMiddleware } from './request-logger.middleware'; diff --git a/apps/backend/src/common/middleware/request-logger.middleware.ts b/apps/backend/src/common/middleware/request-logger.middleware.ts new file mode 100644 index 0000000..6ac55e7 --- /dev/null +++ b/apps/backend/src/common/middleware/request-logger.middleware.ts @@ -0,0 +1,62 @@ +import { Injectable, NestMiddleware, Logger } from '@nestjs/common'; +import { Request, Response, NextFunction } from 'express'; + +@Injectable() +export class RequestLoggerMiddleware implements NestMiddleware { + private logger = new Logger('HTTP'); + private readonly isEnabled: boolean; + private readonly logLevel: 'verbose' | 'standard' | 'minimal'; + + constructor() { + this.isEnabled = process.env.REQUEST_LOGGER !== 'false'; + const level = process.env.REQUEST_LOGGER_LEVEL; + this.logLevel = level === 'verbose' || level === 'minimal' ? level : 'standard'; + } + + use(req: Request, res: Response, next: NextFunction) { + if (!this.isEnabled) { + return next(); + } + + const { method, originalUrl, ip, headers } = req; + const userAgent = headers['user-agent'] || 'unknown'; + const startTime = Date.now(); + + // Log request start (verbose only) + if (this.logLevel === 'verbose') { + this.logger.log(`${method} ${originalUrl} - ${ip} - ${userAgent}`); + } + + // Capture response finish + res.on('finish', () => { + const duration = Date.now() - startTime; + const statusCode = res.statusCode; + const contentLength = res.get('content-length') || 0; + + // Build message based on log level + let message: string; + switch (this.logLevel) { + case 'verbose': + message = `${method} ${originalUrl} ${statusCode} - ${duration}ms - ${contentLength}b - ${ip}`; + break; + case 'minimal': + message = `${method} ${originalUrl} ${statusCode}`; + break; + default: // standard + message = `${method} ${originalUrl} ${statusCode} - ${duration}ms`; + break; + } + + // Determine log level based on status code + if (statusCode >= 500) { + this.logger.error(message); + } else if (statusCode >= 400) { + this.logger.warn(message); + } else { + this.logger.log(message); + } + }); + + next(); + } +} diff --git a/apps/backend/src/import/import.controller.ts b/apps/backend/src/import/import.controller.ts index 88b26e8..bdedb9a 100644 --- a/apps/backend/src/import/import.controller.ts +++ b/apps/backend/src/import/import.controller.ts @@ -4,9 +4,11 @@ import { Get, Delete, Param, + Body, UploadedFile, UseInterceptors, BadRequestException, + Logger, } from '@nestjs/common'; import { FileInterceptor } from '@nestjs/platform-express'; import { ApiTags, ApiOperation, ApiResponse, ApiBearerAuth, ApiParam, ApiConsumes, ApiBody, ApiProperty } from '@nestjs/swagger'; @@ -22,10 +24,17 @@ class UploadResponseDto { message: string; } +class ImportUrlDto { + @ApiProperty({ description: 'URL to import', example: 'https://sakurazaka46.com/s/s46/diary/detail/68008' }) + url: string; +} + @ApiTags('import') @ApiBearerAuth() @Controller('import') export class ImportController { + private readonly logger = new Logger(ImportController.name); + constructor(private importService: ImportService) {} @Post('characters/:characterId/files') @@ -98,4 +107,28 @@ export class ImportController { await this.importService.deleteKnowledge(knowledgeId, userId); return { message: 'Knowledge deleted successfully' }; } + + @Post('characters/:characterId/url') + @ApiOperation({ summary: 'Import content from URL for character knowledge' }) + @ApiParam({ name: 'characterId', description: 'Character ID' }) + @ApiBody({ type: ImportUrlDto }) + @ApiResponse({ status: 201, description: 'URL content is being imported and processed', type: UploadResponseDto }) + @ApiResponse({ status: 400, description: 'Invalid URL or unsupported website' }) + @ApiResponse({ status: 401, description: 'Unauthorized' }) + async importFromUrl( + @Param('characterId') characterId: string, + @Body() importUrlDto: ImportUrlDto, + @CurrentUser('userId') userId: string, + ): Promise { + this.logger.log(`Received URL import request for character: ${characterId}, url: ${importUrlDto.url}`); + + if (!importUrlDto.url) { + this.logger.warn('URL import request rejected: URL is required'); + throw new BadRequestException('URL is required'); + } + + const result = await this.importService.importFromUrl(importUrlDto.url, characterId, userId); + this.logger.log(`URL import started, knowledgeId: ${result.knowledgeId}`); + return result; + } } diff --git a/apps/backend/src/import/import.service.ts b/apps/backend/src/import/import.service.ts index 605243f..4ae3d4f 100644 --- a/apps/backend/src/import/import.service.ts +++ b/apps/backend/src/import/import.service.ts @@ -1,19 +1,23 @@ -import { Injectable, BadRequestException } from '@nestjs/common'; +import { Injectable, BadRequestException, Logger } from '@nestjs/common'; import { PrismaService } from '../prisma/prisma.service'; import { MemoryService } from '../vector/memory.service'; import { TextFileAdapter } from './adapters/text-file.adapter'; import { IImportAdapter, ImportResult } from './interfaces/import-adapter.interface'; -import { ImportStatus } from '@prisma/client'; +import { IWebScraper, WebScraperResult } from './interfaces/web-scraper.interface'; +import { SakurazakaScraper } from './scrapers/sakurazaka-scraper'; @Injectable() export class ImportService { + private readonly logger = new Logger(ImportService.name); private adapters: IImportAdapter[]; + private scrapers: IWebScraper[]; constructor( private prisma: PrismaService, private memoryService: MemoryService, ) { this.adapters = [new TextFileAdapter()]; + this.scrapers = [new SakurazakaScraper()]; } async uploadFile( @@ -33,6 +37,14 @@ export class ImportService { // Parse the file const result = await adapter.parse(file); + // Reject if content is too large + const maxRawContentLength = 100000; + if (result.content.length > maxRawContentLength) { + throw new BadRequestException( + `File too large: ${result.content.length} characters (max: ${maxRawContentLength}). Please use a smaller file.` + ); + } + // Create knowledge entry const knowledge = await this.prisma.characterKnowledge.create({ data: { @@ -49,18 +61,9 @@ export class ImportService { }); // Process the content in the background + this.logger.log(`[${knowledge.id}] Starting background processing for file upload, character: ${characterId}`); this.processContent(knowledge.id, characterId, result).catch((error) => { - console.error('Error processing import:', error); - this.prisma.characterKnowledge.update({ - where: { id: knowledge.id }, - data: { - status: 'failed', - processingInfo: { - ...result.metadata, - error: error instanceof Error ? error.message : 'Unknown error', - }, - }, - }); + this.logger.error(`[${knowledge.id}] Error processing file import:`, error); }); return { @@ -141,24 +144,86 @@ export class ImportService { characterId: string, result: ImportResult, ): Promise { - try { - // Chunk the content into smaller pieces - const chunks = this.chunkContent(result.content, 1000, 200); + this.logger.log(`[${knowledgeId}] Starting processContent, content length: ${result.content.length}`); + + // Reduce memory pressure by limiting content size more aggressively + const maxContentLength = 15000; // Reduced from 30000 + let content = result.content; + if (content.length > maxContentLength) { + this.logger.warn(`[${knowledgeId}] Content truncated from ${content.length} to ${maxContentLength} characters`); + content = content.substring(0, maxContentLength) + '\n\n[Content truncated due to size limits]'; + } - // Store each chunk in vector memory - for (let i = 0; i < chunks.length; i++) { - await this.memoryService.storeCharacterKnowledge( - chunks[i], - characterId, - knowledgeId, - { - ...result.metadata, - chunkIndex: i, - totalChunks: chunks.length, - }, - ); + // Process chunks one at a time using generator to minimize memory usage + const chunkSize = 800; // Reduced from 1000 + const overlap = 100; // Reduced from 200 + const maxChunks = 20; // Reduced from 30 + + let processedChunks = 0; + let totalChunks = 0; + let start = 0; + + try { + this.logger.log(`[${knowledgeId}] Processing chunks with streaming approach...`); + + while (start < content.length && processedChunks < maxChunks) { + // Calculate chunk boundaries + const end = Math.min(start + chunkSize, content.length); + let chunkEnd = end; + + // Try to break at a sentence boundary (only if not at end) + if (end < content.length) { + const chunk = content.slice(start, end); + const lastPeriod = chunk.lastIndexOf('.'); + const lastNewline = chunk.lastIndexOf('\n'); + const breakPoint = Math.max(lastPeriod, lastNewline); + + if (breakPoint > chunkSize * 0.5) { + chunkEnd = start + breakPoint + 1; + } + } + + // Extract the chunk + const chunk = content.slice(start, chunkEnd).trim(); + + if (chunk.length > 0) { + totalChunks++; + + if (processedChunks < maxChunks) { + this.logger.debug(`[${knowledgeId}] Processing chunk ${processedChunks + 1} (${chunk.length} chars)...`); + + try { + await this.memoryService.storeCharacterKnowledge( + chunk, + characterId, + knowledgeId, + { + ...result.metadata, + chunkIndex: processedChunks, + }, + ); + processedChunks++; + this.logger.debug(`[${knowledgeId}] Chunk ${processedChunks} stored successfully`); + } catch (chunkError) { + this.logger.error(`[${knowledgeId}] Failed to store chunk ${processedChunks + 1}:`, chunkError); + throw chunkError; + } + + // Force garbage collection opportunity between chunks + if (processedChunks % 2 === 0) { + await new Promise((resolve) => setTimeout(resolve, 150)); + } + } + } + + // Move to next chunk position + const nextStart = start + (chunkEnd - start) - overlap; + if (nextStart <= start) break; // Prevent infinite loop + start = nextStart; } + this.logger.log(`[${knowledgeId}] Processed ${processedChunks}/${totalChunks} chunks, updating status to completed`); + // Update status to completed await this.prisma.characterKnowledge.update({ where: { id: knowledgeId }, @@ -166,49 +231,219 @@ export class ImportService { status: 'completed', processingInfo: { ...result.metadata, - chunksProcessed: chunks.length, + chunksProcessed: processedChunks, + originalChunks: totalChunks, + wasTruncated: result.content.length > maxContentLength || totalChunks > maxChunks, }, }, }); + + this.logger.log(`[${knowledgeId}] Processing completed successfully`); } catch (error) { + this.logger.error(`[${knowledgeId}] Error in processContent:`, error); + // Update status to failed - await this.prisma.characterKnowledge.update({ - where: { id: knowledgeId }, - data: { - status: 'failed', - processingInfo: { - ...result.metadata, - error: error instanceof Error ? error.message : 'Unknown error', + try { + await this.prisma.characterKnowledge.update({ + where: { id: knowledgeId }, + data: { + status: 'failed', + processingInfo: { + ...result.metadata, + error: error instanceof Error ? error.message : 'Unknown error', + }, }, - }, - }); + }); + this.logger.log(`[${knowledgeId}] Status updated to failed`); + } catch (dbError) { + this.logger.error(`[${knowledgeId}] Failed to update status to failed:`, dbError); + } + throw error; } } - private chunkContent(content: string, chunkSize: number, overlap: number): string[] { - const chunks: string[] = []; - let start = 0; - - while (start < content.length) { - const end = Math.min(start + chunkSize, content.length); - let chunk = content.slice(start, end); - - // Try to break at a sentence boundary - if (end < content.length) { - const lastPeriod = chunk.lastIndexOf('.'); - const lastNewline = chunk.lastIndexOf('\n'); - const breakPoint = Math.max(lastPeriod, lastNewline); - - if (breakPoint > chunkSize * 0.5) { - chunk = chunk.slice(0, breakPoint + 1); - } - } - - chunks.push(chunk.trim()); - start += chunk.length - overlap; + async importFromUrl( + url: string, + characterId: string, + userId: string, + ): Promise<{ knowledgeId: string; message: string }> { + // Validate URL format + let urlObj: URL; + try { + urlObj = new URL(url); + } catch { + throw new BadRequestException('Invalid URL format'); } - return chunks; + // Find appropriate scraper + const scraper = this.scrapers.find((s) => s.canHandle(url)); + + if (!scraper) { + throw new BadRequestException( + `Unsupported URL: ${urlObj.hostname}. No scraper available for this website.`, + ); + } + + // Scrape the content + const result = await scraper.scrape(url); + + // Reject if content is too large + const maxRawContentLength = 100000; + if (result.content.length > maxRawContentLength) { + throw new BadRequestException( + `Content too large: ${result.content.length} characters (max: ${maxRawContentLength}). Please use a shorter article.` + ); + } + + // Create knowledge entry + const knowledgeName = result.metadata.title || `Import from ${urlObj.hostname}`; + const knowledge = await this.prisma.characterKnowledge.create({ + data: { + name: knowledgeName, + sourceType: 'url', + sourceName: url, + mimeType: 'text/html', + rawContent: result.content, + status: 'processing', + processingInfo: result.metadata, + characterId, + }, + }); + + // Process the content in the background + this.logger.log(`[${knowledge.id}] Starting background processing for URL: ${url}, character: ${characterId}`); + this.processScrapedContent(knowledge.id, characterId, result).catch((error) => { + this.logger.error(`[${knowledge.id}] Error processing URL import:`, error); + }); + + return { + knowledgeId: knowledge.id, + message: 'URL content is being imported and processed', + }; + } + + private async processScrapedContent( + knowledgeId: string, + characterId: string, + result: WebScraperResult, + ): Promise { + this.logger.log(`[${knowledgeId}] Starting processScrapedContent, content length: ${result.content.length}`); + + // Reduce memory pressure by limiting content size more aggressively + const maxContentLength = 15000; // Reduced from 30000 + let content = result.content; + if (content.length > maxContentLength) { + this.logger.warn(`[${knowledgeId}] Content truncated from ${content.length} to ${maxContentLength} characters`); + content = content.substring(0, maxContentLength) + '\n\n[Content truncated due to size limits]'; + } + + // Process chunks one at a time using streaming approach to minimize memory usage + const chunkSize = 800; // Reduced from 1000 + const overlap = 100; // Reduced from 200 + const maxChunks = 20; // Reduced from 30 + + let processedChunks = 0; + let totalChunks = 0; + let start = 0; + + try { + this.logger.log(`[${knowledgeId}] Processing chunks with streaming approach...`); + + while (start < content.length && processedChunks < maxChunks) { + // Calculate chunk boundaries + const end = Math.min(start + chunkSize, content.length); + let chunkEnd = end; + + // Try to break at a sentence boundary (only if not at end) + if (end < content.length) { + const chunk = content.slice(start, end); + const lastPeriod = chunk.lastIndexOf('.'); + const lastNewline = chunk.lastIndexOf('\n'); + const breakPoint = Math.max(lastPeriod, lastNewline); + + if (breakPoint > chunkSize * 0.5) { + chunkEnd = start + breakPoint + 1; + } + } + + // Extract the chunk + const chunk = content.slice(start, chunkEnd).trim(); + + if (chunk.length > 0) { + totalChunks++; + + if (processedChunks < maxChunks) { + this.logger.debug(`[${knowledgeId}] Processing chunk ${processedChunks + 1} (${chunk.length} chars)...`); + + try { + await this.memoryService.storeCharacterKnowledge( + chunk, + characterId, + knowledgeId, + { + ...result.metadata, + chunkIndex: processedChunks, + }, + ); + processedChunks++; + this.logger.debug(`[${knowledgeId}] Chunk ${processedChunks} stored successfully`); + } catch (chunkError) { + this.logger.error(`[${knowledgeId}] Failed to store chunk ${processedChunks + 1}:`, chunkError); + throw chunkError; + } + + // Force garbage collection opportunity between chunks + if (processedChunks % 2 === 0) { + await new Promise((resolve) => setTimeout(resolve, 150)); + } + } + } + + // Move to next chunk position + const nextStart = start + (chunkEnd - start) - overlap; + if (nextStart <= start) break; // Prevent infinite loop + start = nextStart; + } + + this.logger.log(`[${knowledgeId}] Processed ${processedChunks}/${totalChunks} chunks, updating status to completed`); + + // Update status to completed + await this.prisma.characterKnowledge.update({ + where: { id: knowledgeId }, + data: { + status: 'completed', + processingInfo: { + ...result.metadata, + chunksProcessed: processedChunks, + originalChunks: totalChunks, + wasTruncated: result.content.length > maxContentLength || totalChunks > maxChunks, + }, + }, + }); + + this.logger.log(`[${knowledgeId}] Processing completed successfully`); + } catch (error) { + this.logger.error(`[${knowledgeId}] Error in processScrapedContent:`, error); + + // Update status to failed + try { + await this.prisma.characterKnowledge.update({ + where: { id: knowledgeId }, + data: { + status: 'failed', + processingInfo: { + ...result.metadata, + error: error instanceof Error ? error.message : 'Unknown error', + }, + }, + }); + this.logger.log(`[${knowledgeId}] Status updated to failed`); + } catch (dbError) { + this.logger.error(`[${knowledgeId}] Failed to update status to failed:`, dbError); + } + + throw error; + } } } diff --git a/apps/backend/src/import/interfaces/index.ts b/apps/backend/src/import/interfaces/index.ts new file mode 100644 index 0000000..2d56b4c --- /dev/null +++ b/apps/backend/src/import/interfaces/index.ts @@ -0,0 +1,2 @@ +export { IImportAdapter, ImportResult } from './import-adapter.interface'; +export { IWebScraper, WebScraperResult } from './web-scraper.interface'; diff --git a/apps/backend/src/import/interfaces/web-scraper.interface.ts b/apps/backend/src/import/interfaces/web-scraper.interface.ts new file mode 100644 index 0000000..657f539 --- /dev/null +++ b/apps/backend/src/import/interfaces/web-scraper.interface.ts @@ -0,0 +1,23 @@ +export interface WebScraperResult { + content: string; + metadata: { + sourceName: string; + url: string; + title?: string; + author?: string; + publishedDate?: string; + [key: string]: any; + }; +} + +export interface IWebScraper { + /** + * Check if this scraper can handle the given URL + */ + canHandle(url: string): boolean; + + /** + * Scrape content from the URL + */ + scrape(url: string): Promise; +} diff --git a/apps/backend/src/import/scrapers/index.ts b/apps/backend/src/import/scrapers/index.ts new file mode 100644 index 0000000..1c5699a --- /dev/null +++ b/apps/backend/src/import/scrapers/index.ts @@ -0,0 +1 @@ +export { SakurazakaScraper } from './sakurazaka-scraper'; diff --git a/apps/backend/src/import/scrapers/sakurazaka-scraper.spec.ts b/apps/backend/src/import/scrapers/sakurazaka-scraper.spec.ts new file mode 100644 index 0000000..03b5db9 --- /dev/null +++ b/apps/backend/src/import/scrapers/sakurazaka-scraper.spec.ts @@ -0,0 +1,337 @@ +import { SakurazakaScraper } from './sakurazaka-scraper'; + +describe('SakurazakaScraper', () => { + let scraper: SakurazakaScraper; + + beforeEach(() => { + scraper = new SakurazakaScraper(); + }); + + describe('canHandle', () => { + it('should return true for sakurazaka46.com URLs', () => { + const urls = [ + 'https://sakurazaka46.com/s/s46/diary/detail/68008', + 'https://sakurazaka46.com/s/s46/diary/detail/68008?ima=0000&cd=blog', + 'https://www.sakurazaka46.com/s/s46/diary/detail/10000', + 'https://sakurazaka46.com/s/s46/diary/blog/list', + ]; + + urls.forEach((url) => { + expect(scraper.canHandle(url)).toBe(true); + }); + }); + + it('should return false for non-sakurazaka46.com URLs', () => { + const urls = [ + 'https://example.com/blog/123', + 'https://nogizaka46.com/s/n46/diary/detail/12345', + 'https://hinatazaka46.com/s/h46/diary/detail/12345', + 'https://sakurazaka46.net/fake/blog/123', + 'not-a-url', + '', + ]; + + urls.forEach((url) => { + expect(scraper.canHandle(url)).toBe(false); + }); + }); + + it('should return false for invalid URLs', () => { + expect(scraper.canHandle('invalid-url')).toBe(false); + expect(scraper.canHandle('')).toBe(false); + }); + }); + + describe('scrape', () => { + const mockBlogHtml = ` +
+
+
+
+
+
+
+
+ 2026 + 2 +
+

18

+
+
+
+
+
+
+
+

The growing up train

+
+
+
+

YU MURAI

+
+
+
+
+



こんばんは

14thシングル

嬉しいです

+

四期生は肝が据わっています!

+ +

またね〜
村井優

+
+ +
+
+

櫻坂46メッセージ

+
+
+
+
+ `; + + beforeEach(() => { + global.fetch = jest.fn(); + }); + + afterEach(() => { + jest.resetAllMocks(); + }); + + it('should successfully scrape blog content', async () => { + (global.fetch as jest.Mock).mockResolvedValueOnce({ + ok: true, + text: () => Promise.resolve(mockBlogHtml), + }); + + const result = await scraper.scrape('https://sakurazaka46.com/s/s46/diary/detail/68008'); + + expect(result.content).toContain('Title: The growing up train'); + expect(result.content).toContain('Author: YU MURAI'); + expect(result.content).toContain('Date: 2026-02-18'); + expect(result.content).toContain('こんばんは'); + expect(result.content).toContain('[Image: mobpqiCQR.jpg]'); + expect(result.content).toContain('またね〜'); + expect(result.content).toContain('村井優'); + }); + + it('should extract correct metadata', async () => { + (global.fetch as jest.Mock).mockResolvedValueOnce({ + ok: true, + text: () => Promise.resolve(mockBlogHtml), + }); + + const result = await scraper.scrape('https://sakurazaka46.com/s/s46/diary/detail/68008'); + + expect(result.metadata).toEqual({ + sourceName: 'Sakurazaka46 Blog', + url: 'https://sakurazaka46.com/s/s46/diary/detail/68008', + title: 'The growing up train', + author: 'YU MURAI', + publishedDate: '2026-02-18', + blogId: '68008', + }); + }); + + it('should handle blog posts without title gracefully', async () => { + const htmlWithoutTitle = mockBlogHtml.replace( + /

[^<]*<\/h1>/, + '' + ); + + (global.fetch as jest.Mock).mockResolvedValueOnce({ + ok: true, + text: () => Promise.resolve(htmlWithoutTitle), + }); + + const result = await scraper.scrape('https://sakurazaka46.com/s/s46/diary/detail/68008'); + + expect(result.metadata.title).toBeFalsy(); + }); + + it('should handle blog posts without author gracefully', async () => { + const htmlWithoutAuthor = mockBlogHtml.replace( + /

[^<]*<\/p>/, + '' + ); + + (global.fetch as jest.Mock).mockResolvedValueOnce({ + ok: true, + text: () => Promise.resolve(htmlWithoutAuthor), + }); + + const result = await scraper.scrape('https://sakurazaka46.com/s/s46/diary/detail/68008'); + + expect(result.metadata.author).toBeFalsy(); + }); + + it('should handle HTTP errors', async () => { + (global.fetch as jest.Mock).mockResolvedValueOnce({ + ok: false, + status: 404, + statusText: 'Not Found', + }); + + await expect( + scraper.scrape('https://sakurazaka46.com/s/s46/diary/detail/99999') + ).rejects.toThrow('Failed to fetch page: 404 Not Found'); + }); + + it('should handle network errors', async () => { + (global.fetch as jest.Mock).mockRejectedValueOnce(new Error('Network error')); + + await expect( + scraper.scrape('https://sakurazaka46.com/s/s46/diary/detail/68008') + ).rejects.toThrow('Failed to scrape Sakurazaka46 blog: Network error'); + }); + + it('should handle missing article element', async () => { + const htmlWithoutArticle = '

No article here
'; + + (global.fetch as jest.Mock).mockResolvedValueOnce({ + ok: true, + text: () => Promise.resolve(htmlWithoutArticle), + }); + + await expect( + scraper.scrape('https://sakurazaka46.com/s/s46/diary/detail/68008') + ).rejects.toThrow('Failed to scrape Sakurazaka46 blog: Could not find article content in the page'); + }); + + it('should handle blog ID extraction from various URL formats', async () => { + (global.fetch as jest.Mock).mockResolvedValue({ + ok: true, + text: () => Promise.resolve(mockBlogHtml), + }); + + const testCases = [ + { url: 'https://sakurazaka46.com/s/s46/diary/detail/68008', expectedId: '68008' }, + { url: 'https://sakurazaka46.com/s/s46/diary/detail/68008?ima=0000&cd=blog', expectedId: '68008' }, + { url: 'https://sakurazaka46.com/s/s46/diary/detail/12345#anchor', expectedId: '12345' }, + ]; + + for (const { url, expectedId } of testCases) { + const result = await scraper.scrape(url); + expect(result.metadata.blogId).toBe(expectedId); + } + }); + + it('should clean up HTML entities and tags properly', async () => { + const htmlWithEntities = ` +
+

Test & Example

+
+

Hello world <script>alert(1)</script>

+

Line 1
Line 2

+
+
+ `; + + (global.fetch as jest.Mock).mockResolvedValueOnce({ + ok: true, + text: () => Promise.resolve(htmlWithEntities), + }); + + const result = await scraper.scrape('https://sakurazaka46.com/s/s46/diary/detail/1'); + + expect(result.content).toContain('Test & Example'); + expect(result.content).toContain('Hello world'); + expect(result.content).not.toContain(' '); + expect(result.content).not.toContain('<script>'); + expect(result.content).not.toContain('
'); + }); + + it('should remove navigation and footer sections', async () => { + (global.fetch as jest.Mock).mockResolvedValueOnce({ + ok: true, + text: () => Promise.resolve(mockBlogHtml), + }); + + const result = await scraper.scrape('https://sakurazaka46.com/s/s46/diary/detail/68008'); + + // Navigation elements should not be in the content + expect(result.content).not.toContain('前へ'); + expect(result.content).not.toContain('村井 優のブログ一覧'); + expect(result.content).not.toContain('櫻坂46メッセージ'); + expect(result.content).not.toContain('blog-foot-nav'); + expect(result.content).not.toContain('app-button'); + }); + + it('should handle multiple images in blog post', async () => { + const htmlWithMultipleImages = ` +
+

Multi Image Post

+
+ +

Text between images

+ + +

After images text

+
+
+ `; + + (global.fetch as jest.Mock).mockResolvedValueOnce({ + ok: true, + text: () => Promise.resolve(htmlWithMultipleImages), + }); + + const result = await scraper.scrape('https://sakurazaka46.com/s/s46/diary/detail/1'); + + expect(result.content).toContain('[Image: image1.jpg]'); + expect(result.content).toContain('[Image: image2.png]'); + expect(result.content).toContain('[Image: image3.gif]'); + expect(result.content).toContain('Text between images'); + expect(result.content).toContain('After images text'); + }); + + it('should extract all images from complex blog structure', async () => { + const htmlWithComplexImages = ` +
+
+
+

+

First paragraph

+


+

Second paragraph with inline image

+
+
+
+ `; + + (global.fetch as jest.Mock).mockResolvedValueOnce({ + ok: true, + text: () => Promise.resolve(htmlWithComplexImages), + }); + + const result = await scraper.scrape('https://sakurazaka46.com/s/s46/diary/detail/1'); + + expect(result.content).toContain('[Image: blog1.jpg]'); + expect(result.content).toContain('[Image: blog2.jpg]'); + expect(result.content).toContain('[Image: blog3.jpg]'); + expect(result.content).toContain('[Image: blog4.jpg]'); + expect(result.content).toContain('First paragraph'); + expect(result.content).toContain('Second paragraph'); + }); + + it('should set correct User-Agent header', async () => { + (global.fetch as jest.Mock).mockResolvedValueOnce({ + ok: true, + text: () => Promise.resolve(mockBlogHtml), + }); + + await scraper.scrape('https://sakurazaka46.com/s/s46/diary/detail/68008'); + + expect(global.fetch).toHaveBeenCalledWith( + 'https://sakurazaka46.com/s/s46/diary/detail/68008', + expect.objectContaining({ + headers: expect.objectContaining({ + 'User-Agent': expect.stringContaining('Mozilla/5.0'), + }), + }) + ); + }); + }); +}); diff --git a/apps/backend/src/import/scrapers/sakurazaka-scraper.ts b/apps/backend/src/import/scrapers/sakurazaka-scraper.ts new file mode 100644 index 0000000..3183132 --- /dev/null +++ b/apps/backend/src/import/scrapers/sakurazaka-scraper.ts @@ -0,0 +1,238 @@ +import { Logger } from '@nestjs/common'; +import { IWebScraper, WebScraperResult } from '../interfaces/web-scraper.interface'; + +/** + * Web scraper for Sakurazaka46 blog posts + * Handles URLs like: https://sakurazaka46.com/s/s46/diary/detail/{id} + */ +export class SakurazakaScraper implements IWebScraper { + private readonly logger = new Logger(SakurazakaScraper.name); + private readonly baseUrl = 'sakurazaka46.com'; + + canHandle(url: string): boolean { + try { + const urlObj = new URL(url); + return ( + urlObj.hostname === this.baseUrl || + urlObj.hostname === `www.${this.baseUrl}` + ); + } catch { + return false; + } + } + + async scrape(url: string): Promise { + this.logger.log(`Starting to scrape URL: ${url}`); + + try { + // Fetch the page content + this.logger.debug(`Fetching page content from: ${url}`); + const response = await fetch(url, { + headers: { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36', + }, + }); + + if (!response.ok) { + this.logger.error(`Failed to fetch page: ${response.status} ${response.statusText}`); + throw new Error(`Failed to fetch page: ${response.status} ${response.statusText}`); + } + + this.logger.debug(`Page fetched successfully, reading content...`); + const html = await response.text(); + this.logger.debug(`HTML content received: ${html.length} bytes`); + + // Extract content + this.logger.debug('Extracting content from HTML...'); + const content = this.extractContent(html); + this.logger.log(`Content extracted: ${content.length} characters`); + + const metadata = this.extractMetadata(html, url); + this.logger.log(`Metadata extracted:`, metadata); + + return { + content, + metadata, + }; + } catch (error) { + this.logger.error(`Failed to scrape Sakurazaka46 blog:`, error); + throw new Error( + `Failed to scrape Sakurazaka46 blog: ${error instanceof Error ? error.message : 'Unknown error'}` + ); + } + } + + private extractContent(html: string): string { + // Find the article element + const articleMatch = html.match(/]*>([\s\S]*?)<\/article>/i); + if (!articleMatch) { + throw new Error('Could not find article content in the page'); + } + + const article = articleMatch[0]; + const parts: string[] = []; + + // Extract title + const titleMatch = article.match(/]*class="[^"]*title[^"]*"[^>]*>([\s\S]*?)<\/h1>/i); + if (titleMatch) { + const title = this.stripHtml(titleMatch[1]).trim(); + if (title) parts.push(`Title: ${title}`); + } + + // Extract author + const authorMatch = article.match(/]*class="[^"]*eigo[^"]*"[^>]*>([\s\S]*?)<\/p>/i); + if (authorMatch) { + const author = this.stripHtml(authorMatch[1]).trim(); + if (author) parts.push(`Author: ${author}`); + } + + // Extract date from the calendar structure + const yearMatch = article.match(/]*class="[^"]*ym-year[^"]*"[^>]*>([\s\S]*?)<\/span>/i); + const monthMatch = article.match(/]*class="[^"]*ym-month[^"]*"[^>]*>([\s\S]*?)<\/span>/i); + const dayMatch = article.match(/]*class="[^"]*date[^"]*"[^>]*>([\s\S]*?)<\/p>/i); + + if (yearMatch && monthMatch && dayMatch) { + const year = this.stripHtml(yearMatch[1]).trim(); + const month = this.stripHtml(monthMatch[1]).trim(); + const day = this.stripHtml(dayMatch[1]).trim(); + parts.push(`Date: ${year}-${month.padStart(2, '0')}-${day.padStart(2, '0')}`); + } + + // Extract main content from box-article + // Find box-article and capture until we hit navigation sections + const boxMatch = article.match(/]*class="[^"]*box-article[^"]*"[^>]*>([\s\S]*)$/i); + if (boxMatch) { + let content = boxMatch[1]; + // Find where navigation starts and cut there + const navIndex = content.search(/]*class="[^"]*col-l[^"]*"[^>]*>/i); + if (navIndex !== -1) { + content = content.substring(0, navIndex); + } + // Also cut at app-button section + const appIndex = content.search(/]*class="[^"]*app-button[^"]*"/i); + if (appIndex !== -1) { + content = content.substring(0, appIndex); + } + const mainContent = this.cleanBlogContent(content); + if (mainContent) { + parts.push(''); + parts.push(mainContent); + } + } + + return parts.join('\n'); + } + + private extractMetadata(html: string, url: string): WebScraperResult['metadata'] { + const metadata: WebScraperResult['metadata'] = { + sourceName: 'Sakurazaka46 Blog', + url, + }; + + // Extract title + const titleMatch = html.match(/]*class="[^"]*title[^"]*"[^>]*>([\s\S]*?)<\/h1>/i); + if (titleMatch) { + metadata.title = this.stripHtml(titleMatch[1]).trim(); + } + + // Extract author + const authorMatch = html.match(/]*class="[^"]*eigo[^"]*"[^>]*>([\s\S]*?)<\/p>/i); + if (authorMatch) { + metadata.author = this.stripHtml(authorMatch[1]).trim(); + } + + // Extract date + const yearMatch = html.match(/]*class="[^"]*ym-year[^"]*"[^>]*>([\s\S]*?)<\/span>/i); + const monthMatch = html.match(/]*class="[^"]*ym-month[^"]*"[^>]*>([\s\S]*?)<\/span>/i); + const dayMatch = html.match(/]*class="[^"]*date[^"]*"[^>]*>([\s\S]*?)<\/p>/i); + + if (yearMatch && monthMatch && dayMatch) { + const year = this.stripHtml(yearMatch[1]).trim(); + const month = this.stripHtml(monthMatch[1]).trim().padStart(2, '0'); + const day = this.stripHtml(dayMatch[1]).trim().padStart(2, '0'); + metadata.publishedDate = `${year}-${month}-${day}`; + } + + // Extract blog ID from URL + const idMatch = url.match(/detail\/(\d+)/); + if (idMatch) { + metadata.blogId = idMatch[1]; + } + + return metadata; + } + + private stripHtml(html: string): string { + return html + .replace(//gi, '\n') + .replace(/<\/p>/gi, '\n') + .replace(/<[^>]+>/g, '') + .replace(/ /g, ' ') + .replace(/</g, '<') + .replace(/>/g, '>') + .replace(/&/g, '&') + .replace(/"/g, '"') + .replace(/'/g, "'") + .trim(); + } + + private cleanBlogContent(html: string): string { + // Remove HTML comments + html = html.replace(//g, ''); + + // Remove blog navigation/footer section + html = html.replace(/]*class="[^"]*blog-foot-nav[^"]*"[\s\S]*$/i, ''); + + // Remove app button section + html = html.replace(/]*class="[^"]*app-button[^"]*"[\s\S]*$/i, ''); + + // Remove script and style tags + let cleaned = html + .replace(/]*>[\s\S]*?<\/script>/gi, '') + .replace(/]*>[\s\S]*?<\/style>/gi, ''); + + // Convert
tags to newlines + cleaned = cleaned.replace(//gi, '\n'); + + // Handle image tags - convert to descriptive text + cleaned = cleaned.replace(/]*src="([^"]*)"[^>]*>/gi, (match, src) => { + // Extract filename from src + const filename = src.split('/').pop(); + return `\n[Image: ${filename}]\n`; + }); + + // Convert paragraph closings to double newlines + cleaned = cleaned.replace(/<\/p>/gi, '\n\n'); + + // Convert div closings to single newlines + cleaned = cleaned.replace(/<\/div>/gi, '\n'); + + // Remove remaining HTML tags + cleaned = cleaned.replace(/<[^>]+>/g, ''); + + // Decode HTML entities + cleaned = cleaned + .replace(/ /g, ' ') + .replace(/</g, '<') + .replace(/>/g, '>') + .replace(/&/g, '&') + .replace(/"/g, '"') + .replace(/'/g, "'"); + + // Clean up excessive whitespace + cleaned = cleaned + .replace(/\n{4,}/g, '\n\n\n') + .trim(); + + // Remove trailing empty image references + cleaned = cleaned.replace(/\n*\[Image:[^\]]*\]\s*$/g, ''); + + // Remove trailing author/date section if present + cleaned = cleaned.replace(/\s+村井\s*優\s*\d{4}\/\d{2}\/\d{2}\s*\d{2}:\d{2}\s*$/g, ''); + + // Final trim + cleaned = cleaned.trim(); + + return cleaned; + } +} diff --git a/apps/backend/src/vector/embedding.service.ts b/apps/backend/src/vector/embedding.service.ts index 98c905c..9a1eb7f 100644 --- a/apps/backend/src/vector/embedding.service.ts +++ b/apps/backend/src/vector/embedding.service.ts @@ -1,13 +1,15 @@ -import { Injectable } from '@nestjs/common'; +import { Injectable, Logger } from '@nestjs/common'; import { IEmbeddingProvider } from './interfaces/embedding-provider.interface'; import { LocalEmbeddingProvider } from './providers/local-embedding.provider'; @Injectable() export class EmbeddingService { + private readonly logger = new Logger(EmbeddingService.name); private provider: IEmbeddingProvider; constructor() { const providerType = process.env.EMBEDDING_PROVIDER || 'local'; + this.logger.log(`Initializing EmbeddingService with provider: ${providerType}`); switch (providerType) { case 'local': @@ -19,7 +21,11 @@ export class EmbeddingService { } async embed(text: string): Promise { - return this.provider.embed(text); + this.logger.debug(`Generating embedding for text (${text.length} chars)...`); + const startTime = Date.now(); + const result = await this.provider.embed(text); + this.logger.debug(`Embedding generated in ${Date.now() - startTime}ms`); + return result; } async embedBatch(texts: string[]): Promise { diff --git a/apps/backend/src/vector/memory.service.ts b/apps/backend/src/vector/memory.service.ts index 524e192..b5bb138 100644 --- a/apps/backend/src/vector/memory.service.ts +++ b/apps/backend/src/vector/memory.service.ts @@ -1,4 +1,4 @@ -import { Injectable } from '@nestjs/common'; +import { Injectable, Logger } from '@nestjs/common'; import { EmbeddingService } from './embedding.service'; import { VectorStoreService, SearchResult } from './vector-store.service'; import { MemoryType } from '@prisma/client'; @@ -11,6 +11,8 @@ export interface MemoryContext { @Injectable() export class MemoryService { + private readonly logger = new Logger(MemoryService.name); + constructor( private embeddingService: EmbeddingService, private vectorStore: VectorStoreService, @@ -26,8 +28,15 @@ export class MemoryService { metadata?: any; }, ): Promise { + const startTime = Date.now(); + this.logger.debug(`[${options.knowledgeId || options.conversationId}] Generating embedding for content (${content.length} chars)...`); + const embedding = await this.embeddingService.embed(content); + this.logger.debug(`[${options.knowledgeId || options.conversationId}] Embedding generated in ${Date.now() - startTime}ms, dimension: ${embedding.length}`); + + this.logger.debug(`[${options.knowledgeId || options.conversationId}] Storing in vector store...`); await this.vectorStore.store(content, embedding, memoryType, options); + this.logger.debug(`[${options.knowledgeId || options.conversationId}] Stored in vector store in ${Date.now() - startTime}ms`); } async retrieveRelevantMemories( @@ -40,9 +49,20 @@ export class MemoryService { memoryType?: MemoryType; }, ): Promise { + const { limit = 5, threshold = 0.6, conversationId, characterId, memoryType } = options; + + this.logger.debug( + `[retrieveRelevantMemories] Query: "${query.substring(0, 100)}...", type: ${memoryType}, characterId: ${characterId}, conversationId: ${conversationId}, threshold: ${threshold}`, + ); + const embedding = await this.embeddingService.embed(query); const results = await this.vectorStore.searchSimilar(embedding, options); + this.logger.debug(`[retrieveRelevantMemories] Found ${results.length} results for type ${memoryType}:`); + results.forEach((r, i) => { + this.logger.debug(` [${i}] similarity: ${r.similarity.toFixed(4)}, content: "${r.content.substring(0, 80)}..."`); + }); + return results.map((result) => ({ content: result.content, metadata: result.metadata, @@ -50,37 +70,44 @@ export class MemoryService { })); } - async buildContextForConversation( - conversationId: string, - currentMessage: string, - characterId: string, - ): Promise { - // Retrieve recent conversation memories - const conversationMemories = await this.retrieveRelevantMemories( - currentMessage, - { - limit: 3, - threshold: 0.6, - conversationId, - memoryType: 'conversation', - }, + async buildContextForConversation(conversationId: string, currentMessage: string, characterId: string): Promise { + this.logger.debug( + `[buildContextForConversation] Building context for conversation ${conversationId}, character ${characterId}, message: "${currentMessage.substring(0, 100)}"`, ); - // Retrieve character knowledge - const characterMemories = await this.retrieveRelevantMemories( - currentMessage, - { - limit: 3, - threshold: 0.7, - characterId, - memoryType: 'character', - }, - ); + // Retrieve recent conversation memories + const conversationMemories = await this.retrieveRelevantMemories(currentMessage, { + limit: 3, + threshold: 0.6, + conversationId, + memoryType: 'conversation', + }); + + // Retrieve character knowledge - using multilingual embedding model for cross-lingual support + // Lower threshold (0.3) for cross-lingual matching (English query -> Japanese content) + let characterMemories = await this.retrieveRelevantMemories(currentMessage, { + limit: 3, + threshold: 0.3, + characterId, + memoryType: 'character', + }); + + // Fallback: If vector search returns no results, retrieve the most recent knowledge + // This ensures the character always has some context for roleplaying + if (characterMemories.length === 0) { + this.logger.debug(`[buildContextForConversation] No similar memories found, falling back to most recent knowledge`); + const recentResults = await this.vectorStore.getRecentByCharacterId(characterId, 2); + characterMemories = recentResults.map((r) => ({ + content: r.content, + metadata: r.metadata, + similarity: r.similarity, + })); + } const contextParts: string[] = []; if (characterMemories.length > 0) { - contextParts.push('Relevant character knowledge:'); + contextParts.push('Here are some descriptions of yourself. These knowledge are your history, what you have done, talked, or imagined:'); characterMemories.forEach((memory) => { contextParts.push(`- ${memory.content}`); }); @@ -93,30 +120,26 @@ export class MemoryService { }); } - return contextParts.join('\n'); + const result = contextParts.join('\n'); + this.logger.debug(`[buildContextForConversation] Final context (${result.length} chars):\n${result || '(empty)'}`); + + return result; } - async storeConversationMessage( - content: string, - conversationId: string, - metadata?: any, - ): Promise { + async storeConversationMessage(content: string, conversationId: string, metadata?: any): Promise { await this.addMemory(content, 'conversation', { conversationId, metadata, }); } - async storeCharacterKnowledge( - content: string, - characterId: string, - knowledgeId: string, - metadata?: any, - ): Promise { + async storeCharacterKnowledge(content: string, characterId: string, knowledgeId: string, metadata?: any): Promise { + this.logger.debug(`[${knowledgeId}] Storing character knowledge chunk for character: ${characterId}`); await this.addMemory(content, 'character', { characterId, knowledgeId, metadata, }); + this.logger.debug(`[${knowledgeId}] Character knowledge chunk stored successfully`); } } diff --git a/apps/backend/src/vector/providers/local-embedding.provider.ts b/apps/backend/src/vector/providers/local-embedding.provider.ts index b52b511..5056642 100644 --- a/apps/backend/src/vector/providers/local-embedding.provider.ts +++ b/apps/backend/src/vector/providers/local-embedding.provider.ts @@ -1,4 +1,4 @@ -import { Injectable, OnModuleInit } from '@nestjs/common'; +import { Injectable, OnModuleInit, Logger } from '@nestjs/common'; import { IEmbeddingProvider } from '../interfaces/embedding-provider.interface'; import { pipeline, FeatureExtractionPipeline } from '@xenova/transformers'; @@ -7,6 +7,8 @@ export class LocalEmbeddingProvider implements IEmbeddingProvider, OnModuleInit private extractor: FeatureExtractionPipeline | null = null; private readonly modelName: string; private readonly dimension: number; + private readonly logger = new Logger(LocalEmbeddingProvider.name); + private isLoading = false; constructor() { this.modelName = process.env.EMBEDDING_MODEL || 'Xenova/all-MiniLM-L6-v2'; @@ -15,31 +17,72 @@ export class LocalEmbeddingProvider implements IEmbeddingProvider, OnModuleInit async onModuleInit() { // Lazy initialization - model will be loaded on first use + this.logger.log(`LocalEmbeddingProvider initialized with model: ${this.modelName}`); } private async getExtractor(): Promise { if (!this.extractor) { - this.extractor = await pipeline('feature-extraction', this.modelName, { - quantized: false, // Use full precision for better quality - }); + if (this.isLoading) { + // Wait for existing load to complete + while (this.isLoading) { + await new Promise((resolve) => setTimeout(resolve, 100)); + } + if (this.extractor) { + return this.extractor; + } + } + + this.isLoading = true; + this.logger.log(`Loading embedding model: ${this.modelName}...`); + + try { + // Use quantized model to reduce memory usage + this.extractor = await pipeline('feature-extraction', this.modelName, { + quantized: true, + revision: 'main', + }); + this.logger.log('Embedding model loaded successfully'); + } catch (error) { + this.logger.error('Failed to load embedding model:', error); + throw error; + } finally { + this.isLoading = false; + } } return this.extractor; } async embed(text: string): Promise { + // Truncate text to prevent excessive memory usage + const maxLength = 512; // Maximum tokens the model can handle efficiently + const truncatedText = text.length > maxLength * 4 ? text.substring(0, maxLength * 4) : text; + const extractor = await this.getExtractor(); - const output = await extractor(text, { pooling: 'mean', normalize: true }); - return Array.from(output.data as Float32Array); + const output = await extractor(truncatedText, { pooling: 'mean', normalize: true }); + + // Convert to array - this creates a copy, allowing original to be GC'd + const result = Array.from(output.data as Float32Array); + + // Add delay to allow GC between embeddings + await new Promise((resolve) => setTimeout(resolve, 10)); + + return result; } async embedBatch(texts: string[]): Promise { - const extractor = await this.getExtractor(); - const outputs = await Promise.all( - texts.map((text) => - extractor(text, { pooling: 'mean', normalize: true }), - ), - ); - return outputs.map((output) => Array.from(output.data as Float32Array)); + const results: number[][] = []; + + // Process one at a time to minimize memory spikes + for (let i = 0; i < texts.length; i++) { + results.push(await this.embed(texts[i])); + + // Add delay every few items to allow GC + if (i > 0 && i % 2 === 0) { + await new Promise((resolve) => setTimeout(resolve, 50)); + } + } + + return results; } getDimension(): number { diff --git a/apps/backend/src/vector/vector-store.service.ts b/apps/backend/src/vector/vector-store.service.ts index b070c0d..c0a363d 100644 --- a/apps/backend/src/vector/vector-store.service.ts +++ b/apps/backend/src/vector/vector-store.service.ts @@ -1,4 +1,4 @@ -import { Injectable } from '@nestjs/common'; +import { Injectable, Logger } from '@nestjs/common'; import { PrismaService } from '../prisma/prisma.service'; import { MemoryType, VectorMemory } from '@prisma/client'; @@ -12,6 +12,8 @@ export interface SearchResult { @Injectable() export class VectorStoreService { + private readonly logger = new Logger(VectorStoreService.name); + constructor(private prisma: PrismaService) {} async store( @@ -25,23 +27,35 @@ export class VectorStoreService { metadata?: any; }, ): Promise { + const startTime = Date.now(); const vectorString = `[${embedding.join(',')}]`; + const metadataJson = options.metadata ? JSON.stringify(options.metadata) : null; + + this.logger.debug(`[${options.knowledgeId}] Storing vector memory, type: ${memoryType}, vector dim: ${embedding.length}`); - return this.prisma.$queryRaw` - INSERT INTO "VectorMemory" (id, content, embedding, "memoryType", metadata, "conversationId", "characterId", "knowledgeId", "createdAt") - VALUES ( - gen_random_uuid(), - ${content}, - ${vectorString}::vector, - ${memoryType}, - ${options.metadata ? JSON.stringify(options.metadata) : null}::jsonb, - ${options.conversationId || null}, - ${options.characterId || null}, - ${options.knowledgeId || null}, - NOW() - ) - RETURNING * - `.then((results) => results[0]); + try { + const result = await this.prisma.$queryRaw` + INSERT INTO "VectorMemory" (id, content, embedding, "memoryType", metadata, "conversationId", "characterId", "knowledgeId", "createdAt") + VALUES ( + gen_random_uuid(), + ${content}, + ${vectorString}::vector, + ${memoryType}, + ${metadataJson}::jsonb, + ${options.conversationId || null}, + ${options.characterId || null}, + ${options.knowledgeId || null}, + NOW() + ) + RETURNING * + `.then((results) => results[0]); + + this.logger.debug(`[${options.knowledgeId}] Vector memory stored in ${Date.now() - startTime}ms, id: ${result.id}`); + return result; + } catch (error) { + this.logger.error(`[${options.knowledgeId}] Failed to store vector memory:`, error); + throw error; + } } async searchSimilar( @@ -93,7 +107,34 @@ export class VectorStoreService { LIMIT $3 `; - return this.prisma.$queryRawUnsafe(query, ...params); + this.logger.debug(`[searchSimilar] Query params: threshold=${threshold}, limit=${limit}, characterId=${options.characterId}, memoryType=${options.memoryType}`); + + const results = await this.prisma.$queryRawUnsafe(query, ...params); + + this.logger.debug(`[searchSimilar] Found ${results.length} results matching criteria`); + + // Debug: Show all similarities for character knowledge + if (options.characterId && options.memoryType === 'character') { + const allQuery = ` + SELECT + id, + content, + "memoryType", + metadata, + 1 - (embedding <=> $1::vector) as similarity + FROM "VectorMemory" + WHERE "characterId" = $2 AND "memoryType" = $3 + ORDER BY embedding <=> $1::vector + LIMIT 10 + `; + const allResults = await this.prisma.$queryRawUnsafe(allQuery, vectorString, options.characterId, options.memoryType); + this.logger.debug(`[searchSimilar] All ${allResults.length} similarities for character ${options.characterId}:`); + allResults.forEach((r, i) => { + this.logger.debug(` [${i}] similarity=${r.similarity.toFixed(4)}, content="${r.content.substring(0, 50)}..."`); + }); + } + + return results; } async deleteByConversation(conversationId: string): Promise { @@ -113,4 +154,28 @@ export class VectorStoreService { where: { knowledgeId }, }); } + + /** + * Retrieve most recent character knowledge (fallback when similarity search returns nothing) + */ + async getRecentByCharacterId(characterId: string, limit: number = 2): Promise { + this.logger.debug(`[getRecentByCharacterId] Retrieving recent knowledge for character ${characterId}`); + + const results = await this.prisma.$queryRaw` + SELECT + id, + content, + "memoryType", + metadata, + 1.0 as similarity + FROM "VectorMemory" + WHERE "characterId" = ${characterId} AND "memoryType" = 'character' + ORDER BY "createdAt" DESC + LIMIT ${limit} + `; + + this.logger.debug(`[getRecentByCharacterId] Found ${results.length} recent memories`); + + return results; + } } diff --git a/apps/frontend/src/App.tsx b/apps/frontend/src/App.tsx index 8dd33d0..f707b93 100644 --- a/apps/frontend/src/App.tsx +++ b/apps/frontend/src/App.tsx @@ -6,6 +6,7 @@ import { CharacterList } from './pages/CharacterList'; import { CharacterForm } from './pages/CharacterForm'; import { ConversationList } from './pages/ConversationList'; import { Chat } from './pages/Chat'; +import { KnowledgeImport } from './pages/KnowledgeImport'; // OAuth Callback Handler - processes tokens from URL before routing function OAuthCallbackHandler({ children }: { children: React.ReactNode }) { @@ -124,6 +125,15 @@ function App() { } /> + + + + + } + /> diff --git a/apps/frontend/src/api/generated/import/import.ts b/apps/frontend/src/api/generated/import/import.ts index 9b137a7..b3e0a95 100644 --- a/apps/frontend/src/api/generated/import/import.ts +++ b/apps/frontend/src/api/generated/import/import.ts @@ -6,7 +6,9 @@ * OpenAPI spec version: 1.0.0 */ import type { - ImportControllerUploadFileBody + ImportControllerUploadFileBody, + ImportUrlDto, + UploadResponseDto, } from '.././model'; import { customFetch } from '../../mutator/custom-fetch'; @@ -109,3 +111,29 @@ export const importControllerDeleteKnowledge = async (knowledgeId: string, optio );} + +/** + * @summary Import content from URL for character knowledge + */ +export const getImportControllerImportFromUrlUrl = (characterId: string,) => { + + + + + return `http://localhost:3000/api/import/characters/${characterId}/url` +} + +export const importControllerImportFromUrl = async (characterId: string, + importUrlDto: ImportUrlDto, options?: RequestInit): Promise => { + + return customFetch(getImportControllerImportFromUrlUrl(characterId), + { + ...options, + method: 'POST', + headers: { + ...options?.headers, + 'Content-Type': 'application/json', + }, + body: JSON.stringify(importUrlDto), + } +);} diff --git a/apps/frontend/src/api/generated/model/importUrlDto.ts b/apps/frontend/src/api/generated/model/importUrlDto.ts new file mode 100644 index 0000000..e688503 --- /dev/null +++ b/apps/frontend/src/api/generated/model/importUrlDto.ts @@ -0,0 +1,12 @@ +/** + * Generated by orval v8.4.2 🍺 + * Do not edit manually. + * DreamChat API + * The DreamChat API documentation + * OpenAPI spec version: 1.0.0 + */ + +export interface ImportUrlDto { + /** URL to import */ + url: string; +} diff --git a/apps/frontend/src/api/generated/model/index.ts b/apps/frontend/src/api/generated/model/index.ts index 43553dd..17edb58 100644 --- a/apps/frontend/src/api/generated/model/index.ts +++ b/apps/frontend/src/api/generated/model/index.ts @@ -20,7 +20,9 @@ export * from './createCharacterDtoAttributes'; export * from './createCharacterDtoConfig'; export * from './createConversationDto'; export * from './importControllerUploadFileBody'; +export * from './importUrlDto'; export * from './keycloakConfigDto'; +export * from './uploadResponseDto'; export * from './keycloakLoginUrlDto'; export * from './loginDto'; export * from './messageResponseDto'; diff --git a/apps/frontend/src/api/generated/model/uploadResponseDto.ts b/apps/frontend/src/api/generated/model/uploadResponseDto.ts new file mode 100644 index 0000000..524c74d --- /dev/null +++ b/apps/frontend/src/api/generated/model/uploadResponseDto.ts @@ -0,0 +1,14 @@ +/** + * Generated by orval v8.4.2 🍺 + * Do not edit manually. + * DreamChat API + * The DreamChat API documentation + * OpenAPI spec version: 1.0.0 + */ + +export interface UploadResponseDto { + /** Knowledge ID */ + knowledgeId: string; + /** Status message */ + message: string; +} diff --git a/apps/frontend/src/pages/CharacterForm.tsx b/apps/frontend/src/pages/CharacterForm.tsx index 854b213..853805f 100644 --- a/apps/frontend/src/pages/CharacterForm.tsx +++ b/apps/frontend/src/pages/CharacterForm.tsx @@ -1,6 +1,7 @@ import { useEffect, useState } from 'react'; import { useNavigate, useParams, Link } from 'react-router-dom'; import { useCharacterStore } from '../stores/characterStore'; +import { importControllerGetCharacterKnowledge } from '../api/generated/import/import'; export function CharacterForm() { const { id } = useParams<{ id: string }>(); @@ -23,10 +24,17 @@ export function CharacterForm() { const [avatarUrl, setAvatarUrl] = useState(''); const [isPublic, setIsPublic] = useState(false); const [attributes, setAttributes] = useState('{}'); + const [knowledgeCount, setKnowledgeCount] = useState(0); useEffect(() => { if (isEditing && id) { getCharacter(id); + // Fetch knowledge count + importControllerGetCharacterKnowledge(id).then((knowledge: any) => { + setKnowledgeCount(knowledge?.length || 0); + }).catch(() => { + setKnowledgeCount(0); + }); } return () => { setCurrentCharacter(null); @@ -174,6 +182,28 @@ export function CharacterForm() { + {isEditing && id && ( +
+
+
+

Knowledge Sources

+

+ {knowledgeCount === 0 + ? 'No knowledge imported yet' + : `${knowledgeCount} knowledge source${knowledgeCount === 1 ? '' : 's'} imported` + } +

+
+ + {knowledgeCount === 0 ? 'Import Knowledge' : 'Manage Knowledge'} + +
+
+ )} +
+ +
+ + {/* Supported Sites Card */} +
+

+ Supported Websites +

+
    + {supportedPatterns.map((pattern, index) => ( +
  • + + {pattern.label} + + {pattern.pattern} + +
  • + ))} +
+

+ More websites will be supported in future updates. +

+
+ + + {/* Right column - Knowledge list */} +
+

+ Knowledge Sources +

+ + {isLoading ? ( +
+
+

Loading...

+
+ ) : knowledgeList.length === 0 ? ( +
+

No knowledge sources yet.

+

+ Import a blog URL to add knowledge to this character. +

+
+ ) : ( +
+ {knowledgeList.map((knowledge) => ( +
+
+
+
+ + {getStatusIcon(knowledge.status)} + {knowledge.status.charAt(0).toUpperCase() + knowledge.status.slice(1)} + + + {knowledge.sourceType === 'url' ? '🔗' : '📄'} {knowledge.sourceType} + +
+ +

+ {knowledge.name} +

+ + {knowledge.sourceType === 'url' && ( +

+ {truncateUrl(knowledge.sourceName)} +

+ )} + +

+ {formatDate(knowledge.createdAt)} +

+ + {knowledge.status === 'processing' && ( +

+ Processing content... This may take a moment. +

+ )} + + {knowledge.status === 'failed' && knowledge.processingInfo?.error && ( +

+ Error: {knowledge.processingInfo.error} +

+ )} + + {knowledge.status === 'completed' && knowledge.processingInfo?.chunksProcessed && ( +

+ ✓ Processed {knowledge.processingInfo.chunksProcessed} chunks +

+ )} +
+ + +
+
+ ))} +
+ )} +
+ + + + ); +} diff --git a/openapi/openapi.json b/openapi/openapi.json index b504d8f..cdb0f93 100644 --- a/openapi/openapi.json +++ b/openapi/openapi.json @@ -902,6 +902,59 @@ "import" ] } + }, + "/api/import/characters/{characterId}/url": { + "post": { + "operationId": "ImportController_importFromUrl", + "parameters": [ + { + "name": "characterId", + "required": true, + "in": "path", + "description": "Character ID", + "schema": { + "type": "string" + } + } + ], + "requestBody": { + "required": true, + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ImportUrlDto" + } + } + } + }, + "responses": { + "201": { + "description": "URL content is being imported and processed", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/UploadResponseDto" + } + } + } + }, + "400": { + "description": "Invalid URL or unsupported website" + }, + "401": { + "description": "Unauthorized" + } + }, + "security": [ + { + "bearer": [] + } + ], + "summary": "Import content from URL for character knowledge", + "tags": [ + "import" + ] + } } }, "info": { @@ -1462,6 +1515,36 @@ "userMessage", "assistantMessage" ] + }, + "ImportUrlDto": { + "type": "object", + "properties": { + "url": { + "type": "string", + "description": "URL to import", + "example": "https://sakurazaka46.com/s/s46/diary/detail/68008" + } + }, + "required": [ + "url" + ] + }, + "UploadResponseDto": { + "type": "object", + "properties": { + "knowledgeId": { + "type": "string", + "description": "Knowledge ID" + }, + "message": { + "type": "string", + "description": "Status message" + } + }, + "required": [ + "knowledgeId", + "message" + ] } } } diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index a4a4911..70dbf77 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -105,6 +105,9 @@ importers: '@types/bcrypt': specifier: ^6.0.0 version: 6.0.0 + '@types/jest': + specifier: ^30.0.0 + version: 30.0.0 '@types/jsonwebtoken': specifier: ^9.0.0 version: 9.0.10 @@ -126,6 +129,9 @@ importers: prisma: specifier: ^7.4.1 version: 7.4.1(@types/react@18.3.28)(react-dom@18.3.1)(react@18.3.1)(typescript@5.9.3) + ts-jest: + specifier: ^29.4.6 + version: 29.4.6(@babel/core@7.29.0)(jest@30.2.0)(typescript@5.9.3) typescript: specifier: ^5.3.0 version: 5.9.3 @@ -2984,6 +2990,13 @@ packages: '@types/istanbul-lib-report': 3.0.3 dev: true + /@types/jest@30.0.0: + resolution: {integrity: sha512-XTYugzhuwqWjws0CVz8QpM36+T+Dz5mTEBKhNs/esGLnCIlGdRy+Dq78NRjd7ls7r8BC8ZRMOrKlkO1hU0JOwA==} + dependencies: + expect: 30.2.0 + pretty-format: 30.2.0 + dev: true + /@types/json-schema@7.0.15: resolution: {integrity: sha512-5+fP8P8MFNC+AyZCDxrB2pkZFPGzqQWUzpSeuuVLvm8VMcorNYavBqoFcxK8bQz4Qsbn4oUEEem4wDLfcysGHA==} dev: true @@ -3982,6 +3995,13 @@ packages: update-browserslist-db: 1.2.3(browserslist@4.28.1) dev: true + /bs-logger@0.2.6: + resolution: {integrity: sha512-pd8DCoxmbgc7hyPKOvxtqNcjYoOsABPQdcCUjGp3d42VR2CX1ORhk2A87oqqu5R1kk+76nsxZupkmyd+MVtCog==} + engines: {node: '>= 6'} + dependencies: + fast-json-stable-stringify: 2.1.0 + dev: true + /bser@2.1.1: resolution: {integrity: sha512-gQxTNE/GAfIIrmHLUE3oJyp5FO6HRBfhjnw4/wMmA63ZGDJnWBmgY/lyQBpnDUkGmAhbSe39tx2d/iTOAfglwQ==} dependencies: @@ -5474,6 +5494,19 @@ packages: resolution: {integrity: sha512-Y8T4vYhEfwJOTbouREvG+3XDsjr8E3kIr7uf+JZ0BYloFsttiHU0WfvANVsR7TxNUJa/WpCnw/Ino/p+DeBhBQ==} dev: false + /handlebars@4.7.8: + resolution: {integrity: sha512-vafaFqs8MZkRrSX7sFVUdo3ap/eNiLnb4IakshzvP56X5Nr1iGKAIqdX6tMlm6HcNRIkr6AxO5jFEoJzzpT8aQ==} + engines: {node: '>=0.4.7'} + hasBin: true + dependencies: + minimist: 1.2.8 + neo-async: 2.6.2 + source-map: 0.6.1 + wordwrap: 1.0.0 + optionalDependencies: + uglify-js: 3.19.3 + dev: true + /has-flag@4.0.0: resolution: {integrity: sha512-EykJT/Q1KjTWctppgIAgfSO0tKVuZUjhgMr17kqTumMl6Afv3EISleU7qZUzoXDFTAHTDC4NOoG/ZxU3EvlMPQ==} engines: {node: '>=8'} @@ -6462,6 +6495,10 @@ packages: resolution: {integrity: sha512-0wJxfxH1wgO3GrbuP+dTTk7op+6L41QCXbGINEmD+ny/G/eCqGzxyCsh7159S+mgDDcoarnBw6PC1PS5+wUGgw==} dev: false + /lodash.memoize@4.1.2: + resolution: {integrity: sha512-t7j+NzmgnQzTAYXcsHYLgimltOV1MXHtlOWf6GjL9Kj8GK5FInw5JotxvbOs+IvV1/Dzo04/fCGfLVs7aXb4Ag==} + dev: true + /lodash.once@4.1.1: resolution: {integrity: sha512-Sb487aTOCr9drQVL8pIxOzVhafOjZN9UU54hiN8PU3uAiSV7lx1yYNpbNmex2PK6dSJoNTSJUUswT651yww3Mg==} dev: false @@ -6546,6 +6583,10 @@ packages: semver: 7.7.4 dev: true + /make-error@1.3.6: + resolution: {integrity: sha512-s8UhlNe7vPKomQhC1qFelMokr/Sc3AgNbso3n74mVPA5LTZwkB9NlXf4XPamLxJE8h0gh73rM94xvwRT2CVInw==} + dev: true + /makeerror@1.0.12: resolution: {integrity: sha512-JmqCvUhmt43madlpFzG4BQzG2Z3m6tvQDNKdClZnO3VbIudJYmxsT0FNJMeiB2+JTSlTQTSbU8QdesVmwJcmLg==} dependencies: @@ -8645,6 +8686,47 @@ packages: resolution: {integrity: sha512-Y/arvbn+rrz3JCKl9C4kVNfTfSm2/mEp5FSz5EsZSANGPSlQrpRI5M4PKF+mJnE52jOO90PnPSc3Ur3bTQw0gA==} dev: true + /ts-jest@29.4.6(@babel/core@7.29.0)(jest@30.2.0)(typescript@5.9.3): + resolution: {integrity: sha512-fSpWtOO/1AjSNQguk43hb/JCo16oJDnMJf3CdEGNkqsEX3t0KX96xvyX1D7PfLCpVoKu4MfVrqUkFyblYoY4lA==} + engines: {node: ^14.15.0 || ^16.10.0 || ^18.0.0 || >=20.0.0} + hasBin: true + peerDependencies: + '@babel/core': '>=7.0.0-beta.0 <8' + '@jest/transform': ^29.0.0 || ^30.0.0 + '@jest/types': ^29.0.0 || ^30.0.0 + babel-jest: ^29.0.0 || ^30.0.0 + esbuild: '*' + jest: ^29.0.0 || ^30.0.0 + jest-util: ^29.0.0 || ^30.0.0 + typescript: '>=4.3 <6' + peerDependenciesMeta: + '@babel/core': + optional: true + '@jest/transform': + optional: true + '@jest/types': + optional: true + babel-jest: + optional: true + esbuild: + optional: true + jest-util: + optional: true + dependencies: + '@babel/core': 7.29.0 + bs-logger: 0.2.6 + fast-json-stable-stringify: 2.1.0 + handlebars: 4.7.8 + jest: 30.2.0(@types/node@24.10.13) + json5: 2.2.3 + lodash.memoize: 4.1.2 + make-error: 1.3.6 + semver: 7.7.4 + type-fest: 4.41.0 + typescript: 5.9.3 + yargs-parser: 21.1.1 + dev: true + /tsconfck@3.1.6(typescript@5.9.3): resolution: {integrity: sha512-ks6Vjr/jEw0P1gmOVwutM3B7fWxoWBL2KRDb1JfqGVawBmO5UsvmWOQFGHBPl5yxYz4eERr19E6L7NMv+Fej4w==} engines: {node: ^18 || >=20} @@ -8774,6 +8856,14 @@ packages: resolution: {integrity: sha512-yDJTmhydvl5lJzBmy/hyOAA0d+aqCBuwl818haVdYCRrWV84o7YyeVm4QlVHStqNrrJSTb6jKuFAVqAFsr+K3Q==} dev: true + /uglify-js@3.19.3: + resolution: {integrity: sha512-v3Xu+yuwBXisp6QYTcH4UbH+xYJXqnq2m/LtQVWKWzYc1iehYnLixoQDN9FH6/j9/oybfd6W9Ghwkl8+UMKTKQ==} + engines: {node: '>=0.8.0'} + hasBin: true + requiresBuild: true + dev: true + optional: true + /uid@2.0.2: resolution: {integrity: sha512-u3xV3X7uzvi5b1MncmZo3i2Aw222Zk1keqLA1YkHldREkAhAqi65wuPfe7lHx8H/Wzy+8CE7S7uS3jekIM5s8g==} engines: {node: '>=8'} @@ -9117,6 +9207,10 @@ packages: stackback: 0.0.2 dev: true + /wordwrap@1.0.0: + resolution: {integrity: sha512-gvVzJFlPycKc5dZN4yPkP8w7Dc37BtP1yczEneOb4uq34pXZcvrtRTmWV8W+Ume+XCxKgbjM+nevkyFPMybd4Q==} + dev: true + /wrap-ansi@6.2.0: resolution: {integrity: sha512-r6lPcBGxZXlIcymEu7InxDMhdW0KDxpLgoFLcguasxCaJ/SOIZwINatK9KY/tf+ZrlywOKU0UDj3ATXUBfxJXA==} engines: {node: '>=8'}