Complete Course Design Project

This commit is contained in:
bz彬彬 2026-01-08 14:11:42 +08:00
commit 545d50e7f1
23 changed files with 4195 additions and 0 deletions

4
.gitignore vendored Normal file
View File

@ -0,0 +1,4 @@
.env
.venv/
__pycache__/
.DS_Store

25
README.md Normal file
View File

@ -0,0 +1,25 @@
# 运行说明
## 方法一:使用虚拟环境
1. 激活虚拟环境:
```bash
source .venv/bin/activate
```
2. 运行脚本:
```bash
python main.py
```
## 方法二:直接使用虚拟环境的 Python 解释器
```bash
.venv/bin/python main.py
```
## 注意事项
- `.env` 文件已包含 DeepSeek API Key,无需修改
- 虚拟环境中已安装所有必要的依赖包(openai、dotenv 等)
- 确保在 `test` 目录下运行命令

159
app.py Normal file
View File

@ -0,0 +1,159 @@
#!/usr/bin/env python3
import os
import tempfile
import json
from flask import Flask, render_template, request, Response, jsonify, redirect, url_for
from openai import OpenAI
from dotenv import load_dotenv
from knowledge_base import get_knowledge_base
# 加载环境变量
load_dotenv()
# 创建Flask应用
app = Flask(__name__)
app.config['UPLOAD_FOLDER'] = tempfile.gettempdir()
app.config['ALLOWED_EXTENSIONS'] = {'txt', 'pdf', 'doc', 'docx'}
# 初始化OpenAI客户端
client = OpenAI(
api_key=os.getenv("DEEPSEEK_API_KEY"),
base_url="https://api.deepseek.com"
)
# 初始化知识库
base = get_knowledge_base()
def allowed_file(filename):
    """Return True when *filename* carries an extension permitted for upload."""
    if '.' not in filename:
        return False
    extension = filename.rsplit('.', 1)[1].lower()
    return extension in app.config['ALLOWED_EXTENSIONS']
@app.route('/')
def home():
    """Serve the single-page UI (templates/index.html)."""
    return render_template('index.html')
@app.route('/upload', methods=['POST'])
def upload_file():
    """Ingest an uploaded document into the knowledge base.

    Expects multipart/form-data with a ``file`` part and an optional
    ``title`` form field. The file is saved to the temp upload folder,
    handed to the knowledge base, then removed.

    Returns:
        JSON payload with the created chunk ids on success; an
        ``error`` payload with HTTP 400/500 otherwise.
    """
    if 'file' not in request.files:
        return jsonify({"error": "没有文件上传"}), 400
    file = request.files['file']
    if file.filename == '':
        return jsonify({"error": "没有选择文件"}), 400
    if file and allowed_file(file.filename):
        # Strip any client-supplied directory components so a crafted
        # filename like "../../x.txt" cannot escape the upload folder.
        safe_name = os.path.basename(file.filename)
        filename = os.path.join(app.config['UPLOAD_FOLDER'], safe_name)
        try:
            file.save(filename)
            # Optional metadata supplied alongside the file.
            metadata = {}
            if 'title' in request.form:
                metadata['title'] = request.form['title']
            # BUG FIX: the collected metadata was previously built but
            # never forwarded, so user-supplied titles were dropped.
            document_ids = base.add_document(file_path=filename, metadata=metadata)
            return jsonify({
                "success": True,
                "message": "文档上传成功",
                "document_ids": document_ids,
                "count": len(document_ids)
            })
        except Exception as e:
            return jsonify({"error": f"上传失败: {str(e)}"}), 500
        finally:
            # Always remove the temp copy, even when ingestion fails
            # (previously the file leaked on any exception).
            if os.path.exists(filename):
                os.remove(filename)
    return jsonify({"error": "不支持的文件类型"}), 400
@app.route('/documents', methods=['GET'])
def list_documents():
    """Return the metadata of every stored document chunk as JSON."""
    docs = base.list_documents()
    payload = {
        "success": True,
        "documents": docs,
        "count": len(docs)
    }
    return jsonify(payload)
@app.route('/documents/<document_id>', methods=['DELETE'])
def delete_document(document_id):
    """Delete one document chunk by id; 404 when the id is unknown."""
    try:
        if base.delete_document(document_id):
            return jsonify({"success": True, "message": "文档删除成功"})
        return jsonify({"error": "文档不存在"}), 404
    except Exception as e:
        return jsonify({"error": f"删除失败: {str(e)}"}), 500
@app.route('/search', methods=['POST'])
def search_documents():
    """Hybrid-search endpoint.

    JSON body: ``query`` (required), ``n_results`` (default 5),
    ``hybrid_weight`` (default 0.5; 0 = pure sparse, 1 = pure dense).
    """
    try:
        body = request.json
        query = body.get('query', '')
        if not query:
            return jsonify({"error": "查询不能为空"}), 400
        results = base.search(
            query,
            body.get('n_results', 5),
            body.get('hybrid_weight', 0.5)
        )
        return jsonify({
            "success": True,
            "results": results,
            "count": len(results)
        })
    except Exception as e:
        return jsonify({"error": f"搜索失败: {str(e)}"}), 500
@app.route('/ask', methods=['POST'])
def ask_question():
    """RAG question-answering endpoint.

    JSON body: ``query``. Retrieves the top-3 chunks from the knowledge
    base, builds a grounded prompt, and streams the model's answer back
    as plain text.
    """
    try:
        query = request.json.get('query', '')
        if not query:
            return jsonify({"error": "问题不能为空"}), 400
        # 1. Retrieve the most relevant chunks.
        search_results = base.search(query, n_results=3)
        # 2. Build the context block, labelling each chunk for citation.
        context = "\n\n".join([f"[文档{idx+1}] {result['content']}" for idx, result in enumerate(search_results)])
        # 3. Build the prompt (answer from context only, cite sources).
        prompt = f"你是一个智能知识库助手,根据以下上下文回答用户问题。\n\n上下文:\n{context}\n\n用户问题:{query}\n\n要求:\n1. 基于上下文回答,不要编造信息\n2. 如果上下文没有相关信息,回答'我没有找到相关信息'\n3. 请引用来源文档,格式为[文档1]、[文档2]等\n4. 保持回答简洁明了\n\n回答:"
        def generate():
            """Yield answer fragments from the streaming chat completion."""
            stream = client.chat.completions.create(
                model="deepseek-chat",
                messages=[
                    {"role": "system", "content": "你是一个智能知识库助手,根据提供的上下文回答用户问题。"},
                    {"role": "user", "content": prompt}
                ],
                temperature=0.3,
                stream=True
            )
            for chunk in stream:
                if chunk.choices[0].delta.content:
                    yield chunk.choices[0].delta.content
        # NOTE(review): exceptions raised inside generate() occur after
        # the 200 response has started streaming, so the except below
        # cannot convert them into a JSON 500 — confirm intended.
        return Response(generate(), mimetype='text/plain')
    except Exception as e:
        return jsonify({"error": f"问答失败: {str(e)}"}), 500
if __name__ == '__main__':
    # Dev server: binds all interfaces on port 5001; debug=True enables
    # the reloader/debugger and must be disabled for production use.
    app.run(debug=True, host='0.0.0.0', port=5001)

View File

@ -0,0 +1,8 @@
README.md
pyproject.toml
intelligent_knowledge_base.egg-info/PKG-INFO
intelligent_knowledge_base.egg-info/SOURCES.txt
intelligent_knowledge_base.egg-info/dependency_links.txt
intelligent_knowledge_base.egg-info/entry_points.txt
intelligent_knowledge_base.egg-info/requires.txt
intelligent_knowledge_base.egg-info/top_level.txt

View File

@ -0,0 +1 @@

View File

@ -0,0 +1,2 @@
[console_scripts]
run = app:main

View File

@ -0,0 +1,15 @@
flask>=3.1.2
openai>=2.14.0
pydantic>=2.12.5
python-dotenv>=1.2.1
chromadb>=0.5.0
langchain>=0.2.0
langchain-community>=0.2.0
langchain-openai>=0.1.0
pypdf>=4.0.0
docx2txt>=0.8
numpy>=1.26.0
tqdm>=4.66.0
scikit-learn>=1.4.0
jieba>=0.42.1
whoosh>=2.7.4

View File

@ -0,0 +1 @@
templates

522
knowledge_base.py Normal file
View File

@ -0,0 +1,522 @@
#!/usr/bin/env python3
import os
import uuid
import time
import chromadb
import pypdf
import docx2txt
import jieba
from chromadb.config import Settings
from typing import List, Dict, Any, Optional
from datetime import datetime
from langchain_text_splitters import RecursiveCharacterTextSplitter
from sklearn.feature_extraction.text import TfidfVectorizer
from whoosh.index import create_in, open_dir
from whoosh.fields import Schema, TEXT, ID, STORED
from whoosh.qparser import MultifieldParser
from whoosh.analysis import Tokenizer, Token
import tempfile
import shutil
# 自定义中文分词器类
class ChineseTokenizer(Tokenizer):
    """Whoosh tokenizer that segments text into words with jieba."""

    def __call__(self, text, **kwargs):
        from jieba import cut
        t = Token()
        pos = 0
        for word in cut(text):
            t.text = word
            t.boost = 1.0
            # Locate this segment in the original text; search resumes at
            # the previous end offset, which assumes jieba yields the
            # segments in order of appearance.
            t.start_pos = text.find(word, pos)
            t.end_pos = t.start_pos + len(word)
            # NOTE(review): a single Token instance is reused across
            # yields (the usual Whoosh pattern), and `pos` here is a
            # character offset while Whoosh's Token.pos is normally a
            # token ordinal — confirm against the field's position needs.
            t.pos = pos  # set the pos attribute
            yield t
            pos = t.end_pos
def ChineseAnalyzer():
    """Factory mirroring Whoosh's analyzer API: build a jieba-backed tokenizer."""
    tokenizer = ChineseTokenizer()
    return tokenizer
class KnowledgeBase:
    """Core knowledge-base class.

    Keeps document chunks mirrored across two stores: a dense vector
    collection (ChromaDB) for semantic search and a sparse keyword index
    (Whoosh with a jieba analyzer) for lexical search, plus an in-memory
    chunk-id -> metadata map.
    """

    def __init__(self, persist_directory: str = "./knowledge_base"):
        """Initialise the knowledge base.

        Args:
            persist_directory: directory where the vector database and
                the sparse index are persisted.
        """
        self.persist_directory = persist_directory
        # Dense store: persistent ChromaDB client with one collection.
        self.client = chromadb.PersistentClient(
            path=persist_directory
        )
        self.collection = self.client.get_or_create_collection(
            name="documents",
            metadata={"description": "智能知识库文档集合"}
        )
        # Chunk metadata map. NOTE(review): load/save below are stubs,
        # so this map is lost on restart while the Chroma/Whoosh data
        # persists — the stores can drift out of sync across runs.
        self.document_metadata = {}
        self.load_metadata()
        # Recursive splitter for chunking (~1000 chars, 200 overlap).
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200,
            length_function=len,
            # NOTE(review): the empty-string separators look like CJK
            # punctuation ("。", "," …) lost in transcription — confirm
            # against the original source.
            separators=["\n\n", "\n", "", "", "", "", "", " "]
        )
        # Sparse store: Whoosh index with jieba-based Chinese analysis.
        self.sparse_index_dir = os.path.join(persist_directory, "sparse_index")
        self.schema = Schema(
            doc_id=ID(stored=True),
            content=TEXT(stored=True, analyzer=ChineseAnalyzer()),
            title=TEXT(stored=True),
            file_path=STORED,
            timestamp=STORED
        )
        # Create the Whoosh index on first run, otherwise reopen it.
        if not os.path.exists(self.sparse_index_dir):
            os.makedirs(self.sparse_index_dir)
            self.sparse_index = create_in(self.sparse_index_dir, self.schema)
        else:
            self.sparse_index = open_dir(self.sparse_index_dir)
        # TF-IDF vectorizer; currently not used by any method below.
        self.tfidf_vectorizer = TfidfVectorizer(tokenizer=jieba.cut, use_idf=True)

    def load_metadata(self):
        """Load chunk metadata.

        Stub: in a real deployment this should read from persistent
        storage; here it simply resets the in-memory map.
        """
        self.document_metadata = {}

    def save_metadata(self):
        """Persist chunk metadata. Stub: intentionally a no-op."""
        pass

    def parse_document(self, file_path: str) -> Dict[str, Any]:
        """Parse a document in one of the supported formats.

        Args:
            file_path: path of the file to parse (.pdf, .doc/.docx, .txt).

        Returns:
            Dict with keys ``content`` (extracted text) and ``metadata``.

        Raises:
            Exception: wraps any parsing failure, including the
                ValueError raised for unsupported extensions.
        """
        file_extension = os.path.splitext(file_path)[1].lower()
        content = ""
        metadata = {
            "file_path": file_path,
            "file_type": file_extension,
            "parsed_at": datetime.now().isoformat()
        }
        try:
            if file_extension == ".pdf":
                # PDF: concatenate the extracted text of every page.
                with open(file_path, "rb") as f:
                    reader = pypdf.PdfReader(f)
                    for page in reader.pages:
                        content += page.extract_text() or ""
                    metadata["num_pages"] = len(reader.pages)
                    # Prefer the embedded PDF title, else the file name.
                    metadata["title"] = reader.metadata.title if reader.metadata and reader.metadata.title else os.path.basename(file_path)
            elif file_extension in [".doc", ".docx"]:
                # Word documents via docx2txt.
                content = docx2txt.process(file_path)
                metadata["title"] = os.path.basename(file_path)
            elif file_extension == ".txt":
                # Plain text, assumed UTF-8.
                with open(file_path, "r", encoding="utf-8") as f:
                    content = f.read()
                metadata["title"] = os.path.basename(file_path)
            else:
                raise ValueError(f"不支持的文件格式: {file_extension}")
        except Exception as e:
            raise Exception(f"文档解析失败: {str(e)}")
        return {"content": content, "metadata": metadata}

    def process_document(self, file_path: str, metadata: Dict = None) -> List[Dict[str, Any]]:
        """Full ETL for one file: parse, split, and package chunks.

        Args:
            file_path: path of the file to ingest.
            metadata: extra metadata merged into every chunk.

        Returns:
            List of chunk dicts with ``id``, ``content`` and ``metadata``.
        """
        # 1. Parse the document.
        parsed_result = self.parse_document(file_path)
        content = parsed_result["content"]
        doc_metadata = parsed_result["metadata"]
        # Merge caller-supplied metadata over the parsed metadata.
        if metadata:
            doc_metadata.update(metadata)
        # 2. Split the text into chunks.
        chunks = self.text_splitter.split_text(content)
        # 3. Package each chunk with a fresh UUID and position info.
        processed_chunks = []
        for i, chunk in enumerate(chunks):
            chunk_id = str(uuid.uuid4())
            chunk_metadata = {
                "id": chunk_id,
                "parent_file": file_path,
                "chunk_index": i,
                "total_chunks": len(chunks),
                "timestamp": datetime.now().isoformat(),
                "version": 1,
                **doc_metadata
            }
            processed_chunks.append({
                "id": chunk_id,
                "content": chunk,
                "metadata": chunk_metadata
            })
        return processed_chunks

    def add_document(self, content: str = None, file_path: str = None, metadata: Dict = None) -> List[str]:
        """Add a document to the knowledge base.

        Args:
            content: raw text (optional when ``file_path`` is given).
            file_path: file to ingest (optional when ``content`` is given).
            metadata: extra metadata merged into every chunk.

        Returns:
            List of ids of the chunks that were created.

        Raises:
            ValueError: when neither content nor file_path is provided.
        """
        if not content and not file_path:
            raise ValueError("必须提供content或file_path")
        processed_chunks = []
        if file_path:
            # File input: run the full parse/split pipeline.
            processed_chunks = self.process_document(file_path, metadata)
        else:
            # Direct text input: split and package inline.
            chunks = self.text_splitter.split_text(content)
            doc_metadata = {
                "timestamp": datetime.now().isoformat(),
                "version": 1,
                "file_type": "text",
                "title": metadata.get("title", "直接输入内容") if metadata else "直接输入内容",
                **(metadata or {})
            }
            for i, chunk in enumerate(chunks):
                chunk_id = str(uuid.uuid4())
                chunk_metadata = {
                    "id": chunk_id,
                    "parent_file": "direct_input",
                    "chunk_index": i,
                    "total_chunks": len(chunks),
                    **doc_metadata
                }
                processed_chunks.append({
                    "id": chunk_id,
                    "content": chunk,
                    "metadata": chunk_metadata
                })
        # Mirror every chunk into both stores.
        chunk_ids = []
        # 1. Collect rows for one batched ChromaDB insert.
        all_documents = []
        all_metadatas = []
        all_ids = []
        for chunk in processed_chunks:
            chunk_id = chunk["id"]
            chunk_content = chunk["content"]
            chunk_metadata = chunk["metadata"]
            all_documents.append(chunk_content)
            all_metadatas.append(chunk_metadata)
            all_ids.append(chunk_id)
            # Record metadata in the in-memory map.
            self.document_metadata[chunk_id] = chunk_metadata
            chunk_ids.append(chunk_id)
        # 2. Batched insert into the vector database.
        if all_documents:
            self.collection.add(
                documents=all_documents,
                metadatas=all_metadatas,
                ids=all_ids
            )
        # 3. Batched insert into the sparse (Whoosh) index.
        writer = self.sparse_index.writer()
        for i in range(len(processed_chunks)):
            chunk = processed_chunks[i]
            writer.add_document(
                doc_id=all_ids[i],
                content=all_documents[i],
                title=all_metadatas[i].get("title", ""),
                file_path=all_metadatas[i].get("file_path", ""),
                timestamp=all_metadatas[i].get("timestamp", "")
            )
        writer.commit()
        self.save_metadata()
        return chunk_ids

    def update_document(self, document_id: str, content: str = None, file_path: str = None, metadata: Dict = None) -> bool:
        """Update a document chunk by deleting and re-adding it.

        Args:
            document_id: id of the chunk being updated.
            content: new raw text (optional when ``file_path`` is given).
            file_path: new file to re-ingest (optional when ``content``
                is given).
            metadata: extra metadata to merge.

        Returns:
            True when the chunk existed and was updated, False otherwise.
        """
        if document_id not in self.document_metadata:
            return False
        current_metadata = self.document_metadata[document_id]
        # 1. Remove the old chunk from both stores.
        self.delete_document(document_id)
        # 2. Re-add from the new content or file.
        if content:
            chunks = self.text_splitter.split_text(content)
            for i, chunk in enumerate(chunks):
                # First chunk keeps the original id; extras get new ids.
                chunk_id = document_id if i == 0 else str(uuid.uuid4())
                new_metadata = {
                    "id": chunk_id,
                    "parent_file": current_metadata.get("parent_file"),
                    "chunk_index": i,
                    "total_chunks": len(chunks),
                    "timestamp": datetime.now().isoformat(),
                    "version": current_metadata.get("version", 1) + 1,
                    # NOTE(review): **current_metadata is unpacked AFTER
                    # the fresh fields above, so the old id/chunk_index/
                    # timestamp/version win the merge and the version
                    # bump is overwritten — looks like a merge-order bug.
                    **current_metadata,
                    **(metadata or {})
                }
                # Insert into the vector database.
                self.collection.add(
                    documents=[chunk],
                    metadatas=[new_metadata],
                    ids=[chunk_id]
                )
                # Insert into the sparse index (one writer per chunk).
                writer = self.sparse_index.writer()
                writer.add_document(
                    doc_id=chunk_id,
                    content=chunk,
                    title=new_metadata.get("title", ""),
                    file_path=new_metadata.get("file_path", ""),
                    timestamp=new_metadata.get("timestamp", "")
                )
                writer.commit()
                # Track the metadata in memory.
                self.document_metadata[chunk_id] = new_metadata
        elif file_path:
            # Re-run the full pipeline for the new file.
            processed_chunks = self.process_document(file_path, {
                "version": current_metadata.get("version", 1) + 1,
                # NOTE(review): same merge-order concern as above — the
                # old metadata overrides the bumped version here too.
                **current_metadata,
                **(metadata or {})
            })
            for chunk in processed_chunks:
                chunk_id = document_id if "chunk_index" in chunk["metadata"] and chunk["metadata"]["chunk_index"] == 0 else str(uuid.uuid4())
                chunk["metadata"]["id"] = chunk_id
                chunk["metadata"]["version"] = current_metadata.get("version", 1) + 1
                # Insert into the vector database.
                self.collection.add(
                    documents=[chunk["content"]],
                    metadatas=[chunk["metadata"]],
                    ids=[chunk_id]
                )
                # Insert into the sparse index.
                writer = self.sparse_index.writer()
                writer.add_document(
                    doc_id=chunk_id,
                    content=chunk["content"],
                    title=chunk["metadata"].get("title", ""),
                    file_path=chunk["metadata"].get("file_path", ""),
                    timestamp=chunk["metadata"].get("timestamp", "")
                )
                writer.commit()
                # Track the metadata in memory.
                self.document_metadata[chunk_id] = chunk["metadata"]
        self.save_metadata()
        return True

    def delete_document(self, document_id: str) -> bool:
        """Delete one chunk from both stores and the metadata map.

        Args:
            document_id: id of the chunk to delete.

        Returns:
            True when the chunk existed, False otherwise.
        """
        if document_id not in self.document_metadata:
            return False
        # 1. Remove from the vector database.
        self.collection.delete(ids=[document_id])
        # 2. Remove from the sparse index.
        writer = self.sparse_index.writer()
        writer.delete_by_term("doc_id", document_id)
        writer.commit()
        # 3. Drop the in-memory metadata.
        del self.document_metadata[document_id]
        self.save_metadata()
        return True

    def get_document(self, document_id: str) -> Optional[Dict[str, Any]]:
        """Return the metadata for one chunk, or None if unknown.

        Args:
            document_id: chunk id to look up.
        """
        return self.document_metadata.get(document_id)

    def list_documents(self) -> List[Dict[str, Any]]:
        """Return the metadata dicts of every known chunk."""
        return list(self.document_metadata.values())

    def search(self, query: str, n_results: int = 5, hybrid_weight: float = 0.5) -> List[Dict[str, Any]]:
        """Hybrid search: dense vectors + sparse keywords, re-ranked.

        Args:
            query: search query text.
            n_results: number of results to return.
            hybrid_weight: blend factor; 0 = pure sparse, 1 = pure dense.

        Returns:
            Re-ranked result list with dense/sparse/hybrid scores.
        """
        # 1. Dense vector search (ChromaDB).
        dense_results = self.collection.query(
            query_texts=[query],
            n_results=n_results * 2,  # over-fetch for re-ranking
            include=["documents", "metadatas", "distances"]
        )
        # Normalise the dense hits into score dicts.
        dense_formatted = []
        for i in range(len(dense_results["ids"][0])):
            dense_formatted.append({
                "id": dense_results["ids"][0][i],
                "content": dense_results["documents"][0][i],
                "metadata": dense_results["metadatas"][0][i],
                "dense_score": 1.0 / (1.0 + dense_results["distances"][0][i]),  # map distance -> similarity
                "sparse_score": 0.0
            })
        # 2. Sparse keyword search (Whoosh).
        sparse_results = []
        with self.sparse_index.searcher() as searcher:
            parser = MultifieldParser(["content", "title"], schema=self.schema)
            whoosh_query = parser.parse(query)
            whoosh_results = searcher.search(whoosh_query, limit=n_results * 2)
            for result in whoosh_results:
                doc_id = result["doc_id"]
                # Skip hits whose metadata is unknown (e.g. stale index
                # entries surviving from a previous run).
                if doc_id in self.document_metadata:
                    sparse_results.append({
                        "id": doc_id,
                        "content": result["content"],
                        "metadata": self.document_metadata[doc_id],
                        "dense_score": 0.0,
                        "sparse_score": result.score
                    })
        # 3. Merge by chunk id; a chunk found by both keeps both scores.
        all_results = {}
        for result in dense_formatted:
            all_results[result["id"]] = result
        for result in sparse_results:
            if result["id"] in all_results:
                all_results[result["id"]]["sparse_score"] = result["sparse_score"]
            else:
                all_results[result["id"]] = result
        # 4. Re-rank by the weighted hybrid score.
        # NOTE(review): dense scores fall in (0, 1] while Whoosh BM25
        # scores are unbounded, so the two terms are on different
        # scales — confirm the weighting behaves as intended.
        def calculate_hybrid_score(result):
            return (hybrid_weight * result["dense_score"]) + ((1 - hybrid_weight) * result["sparse_score"])
        sorted_results = sorted(
            all_results.values(),
            key=calculate_hybrid_score,
            reverse=True
        )[:n_results]
        # 5. Shape the final payload.
        final_results = []
        for result in sorted_results:
            final_results.append({
                "id": result["id"],
                "content": result["content"],
                "metadata": result["metadata"],
                "dense_score": result["dense_score"],
                "sparse_score": result["sparse_score"],
                "hybrid_score": calculate_hybrid_score(result)
            })
        return final_results

    def clear(self):
        """Clear the knowledge base.

        NOTE(review): chromadb's Collection.delete normally requires ids
        or a where filter; calling it with no arguments may raise —
        verify against the installed chromadb version. The Whoosh index
        is also not cleared here.
        """
        self.collection.delete()
        self.document_metadata = {}
        self.save_metadata()
# 创建全局知识库实例
global_knowledge_base = None
def get_knowledge_base() -> KnowledgeBase:
    """Return the process-wide KnowledgeBase, creating it on first use (singleton)."""
    global global_knowledge_base
    if global_knowledge_base is not None:
        return global_knowledge_base
    global_knowledge_base = KnowledgeBase()
    return global_knowledge_base

Binary file not shown.

Binary file not shown.

View File

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

26
pyproject.toml Normal file
View File

@ -0,0 +1,26 @@
[project]
name = "intelligent-knowledge-base"
version = "0.1.0"
description = "智能知识库问答系统 - 端到端RAG解决方案"
readme = "README.md"
requires-python = ">=3.13"
dependencies = [
"flask>=3.1.2",
"openai>=2.14.0",
"pydantic>=2.12.5",
"python-dotenv>=1.2.1",
"chromadb>=0.5.0",
"langchain>=0.2.0",
"langchain-community>=0.2.0",
"langchain-openai>=0.1.0",
"pypdf>=4.0.0",
"docx2txt>=0.8",
"numpy>=1.26.0",
"tqdm>=4.66.0",
"scikit-learn>=1.4.0",
"jieba>=0.42.1",
"whoosh>=2.7.4",
]
[project.scripts]
run = "app:main"

501
templates/index.html Normal file
View File

@ -0,0 +1,501 @@
<!DOCTYPE html>
<html lang="zh-CN">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>智能知识库问答系统</title>
<style>
* {
margin: 0;
padding: 0;
box-sizing: border-box;
}
body {
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Oxygen, Ubuntu, Cantarell, sans-serif;
background: #f5f5f7;
min-height: 100vh;
padding: 20px;
color: #1d1d1f;
}
.container {
max-width: 800px;
margin: 40px auto;
background: white;
border-radius: 18px;
box-shadow: 0 4px 16px rgba(0, 0, 0, 0.08);
padding: 40px;
}
h1 {
text-align: center;
color: #1d1d1f;
margin-bottom: 10px;
font-size: 2.2em;
font-weight: 700;
}
h3 {
color: #1d1d1f;
margin-bottom: 24px;
font-size: 1.3em;
font-weight: 600;
}
.section {
margin-bottom: 40px;
padding: 24px;
background: #ffffff;
border-radius: 12px;
border: 1px solid #e6e6e6;
}
.form-group {
margin-bottom: 24px;
}
label {
display: block;
font-size: 1em;
margin-bottom: 8px;
color: #86868b;
font-weight: 500;
}
input[type="text"], input[type="file"] {
width: 100%;
padding: 12px 16px;
font-size: 1.05em;
border: 1px solid #d2d2d7;
border-radius: 8px;
transition: border-color 0.2s, background-color 0.2s;
background: #ffffff;
}
input[type="text"]:focus, input[type="file"]:focus {
outline: none;
border-color: #0071e3;
background: #ffffff;
}
button {
display: inline-block;
padding: 12px 24px;
font-size: 1em;
background: #0071e3;
color: white;
border: none;
border-radius: 980px;
cursor: pointer;
transition: background-color 0.2s;
font-weight: 600;
margin-right: 10px;
}
button:hover {
background: #0077ed;
}
button:disabled {
background: #d1d1d6;
color: #86868b;
cursor: not-allowed;
}
.response-box {
margin-top: 16px;
padding: 20px;
background: white;
border-radius: 12px;
border: 1px solid #e6e6e6;
min-height: 120px;
font-size: 1em;
line-height: 1.7;
color: #1d1d1f;
white-space: pre-wrap;
}
.loading {
display: inline-block;
width: 16px;
height: 16px;
border: 2px solid rgba(255,255,255,.3);
border-radius: 50%;
border-top-color: #fff;
animation: spin 1s linear infinite;
margin-right: 8px;
}
@keyframes spin {
to { transform: rotate(360deg); }
}
#document-list {
margin-top: 16px;
}
.document-item {
padding: 16px;
background: white;
border-radius: 12px;
margin-bottom: 8px;
border: 1px solid #e6e6e6;
display: flex;
justify-content: space-between;
align-items: center;
}
.document-info {
flex: 1;
}
.document-title {
font-weight: 600;
color: #1d1d1f;
margin-bottom: 4px;
}
.document-meta {
font-size: 0.9em;
color: #86868b;
}
.delete-btn {
background: #ff453a;
padding: 8px 16px;
font-size: 0.9em;
}
.delete-btn:hover {
background: #ff3b30;
}
.success-message, .error-message {
padding: 12px 16px;
border-radius: 12px;
margin-bottom: 16px;
font-weight: 600;
}
.success-message {
background: #32d74b;
color: white;
}
.error-message {
background: #ff453a;
color: white;
}
@media (max-width: 768px) {
body {
padding: 16px;
}
.container {
padding: 24px;
margin: 16px auto;
}
h1 {
font-size: 1.8em;
}
.section {
padding: 16px;
}
button {
width: 100%;
margin-right: 0;
margin-bottom: 12px;
}
.document-item {
flex-direction: column;
align-items: flex-start;
}
.delete-btn {
width: auto;
margin-top: 12px;
}
}
</style>
</head>
<body>
<div class="container">
<h1>智能知识库问答系统</h1>
<!-- 文档上传区 -->
<div class="section">
<h3>文档上传</h3>
<div id="upload-message"></div>
<div class="form-group">
<label for="file-upload">选择文档支持PDF、Word、TXT</label>
<input type="file" id="file-upload" accept=".pdf,.doc,.docx,.txt">
</div>
<div class="form-group">
<label for="document-title">文档标题(可选):</label>
<input type="text" id="document-title" placeholder="为文档添加标题...">
</div>
<button id="upload-btn" onclick="uploadDocument()">上传文档</button>
</div>
<!-- 知识库管理区 -->
<div class="section">
<h3>知识库管理</h3>
<button id="refresh-btn" onclick="loadDocuments()">刷新文档列表</button>
<div id="document-list">
<p>点击刷新按钮查看已上传的文档...</p>
</div>
</div>
<!-- 问答区 -->
<div class="section">
<h3>知识库问答</h3>
<div class="form-group">
<label for="question">请输入您的问题:</label>
<input type="text" id="question" placeholder="例如什么是Python">
</div>
<button id="ask-btn" onclick="askQuestion()">提问</button>
<div class="response-box" id="answer">
输入问题并点击提问按钮获取答案...
</div>
</div>
</div>
<script>
// Populate the document list as soon as the page finishes loading.
document.addEventListener('DOMContentLoaded', function() {
    loadDocuments();
});
// Upload the selected file (plus optional title) via POST /upload.
function uploadDocument() {
    const fileInput = document.getElementById('file-upload');
    const titleInput = document.getElementById('document-title');
    const uploadBtn = document.getElementById('upload-btn');
    const uploadMessage = document.getElementById('upload-message');
    // A file must be selected first.
    if (!fileInput.files || fileInput.files.length === 0) {
        showMessage(uploadMessage, '请选择一个文件', 'error');
        return;
    }
    const file = fileInput.files[0];
    const title = titleInput.value;
    // Clear any previous banner.
    uploadMessage.innerHTML = '';
    // Disable the button and show a spinner while uploading.
    uploadBtn.disabled = true;
    uploadBtn.innerHTML = '<span class="loading"></span>正在上传...';
    // Build the multipart form body.
    const formData = new FormData();
    formData.append('file', file);
    if (title) {
        formData.append('title', title);
    }
    // Send the request.
    fetch('/upload', {
        method: 'POST',
        body: formData
    })
    .then(response => response.json())
    .then(data => {
        if (data.success) {
            showMessage(uploadMessage, `文档上传成功!已处理 ${data.count} 个文档块`, 'success');
            // Reset the form.
            fileInput.value = '';
            titleInput.value = '';
            // Refresh the document list.
            loadDocuments();
        } else {
            showMessage(uploadMessage, `上传失败:${data.error}`, 'error');
        }
    })
    .catch(error => {
        showMessage(uploadMessage, `上传失败:${error.message}`, 'error');
    })
    .finally(() => {
        // Restore the button state.
        uploadBtn.disabled = false;
        uploadBtn.innerHTML = '上传文档';
    });
}
// Refresh the document-list panel from GET /documents.
// The server returns one FLAT metadata object per chunk (fields like
// id/title/parent_file/timestamp live at the top level), so the chunks
// are grouped by parent_file for display.
function loadDocuments() {
    const documentList = document.getElementById('document-list');
    const refreshBtn = document.getElementById('refresh-btn');
    // Disable the button and show a spinner while loading.
    refreshBtn.disabled = true;
    refreshBtn.innerHTML = '<span class="loading"></span>正在加载...';
    fetch('/documents')
    .then(response => response.json())
    .then(data => {
        if (data.success) {
            if (data.documents.length === 0) {
                documentList.innerHTML = '<p>知识库中暂无文档,请先上传文档...</p>';
            } else {
                // Group chunks by their source file.
                const documentsByFile = {};
                data.documents.forEach(doc => {
                    const fileName = doc.parent_file || '未知文件';
                    if (!documentsByFile[fileName]) {
                        documentsByFile[fileName] = [];
                    }
                    documentsByFile[fileName].push(doc);
                });
                let html = '';
                for (const fileName in documentsByFile) {
                    const docs = documentsByFile[fileName];
                    const firstDoc = docs[0];
                    // BUG FIX: title is a top-level field on each chunk
                    // (the server returns the metadata dict itself), so
                    // the old `firstDoc.metadata?.title` was always
                    // undefined and the title never displayed.
                    // NOTE(review): the delete button removes only the
                    // first chunk's id, not the whole file — confirm
                    // whether whole-file deletion was intended.
                    html += `
                        <div class="document-item">
                            <div class="document-info">
                                <div class="document-title">${firstDoc.title || fileName}</div>
                                <div class="document-meta">
                                    文档块数量: ${docs.length} | 上传时间: ${new Date(firstDoc.timestamp).toLocaleString()}
                                </div>
                            </div>
                            <button class="delete-btn" onclick="deleteDocument('${firstDoc.id}')">删除</button>
                        </div>
                    `;
                }
                documentList.innerHTML = html;
            }
        } else {
            documentList.innerHTML = `<p class="error-message">加载失败:${data.error}</p>`;
        }
    })
    .catch(error => {
        documentList.innerHTML = `<p class="error-message">加载失败:${error.message}</p>`;
    })
    .finally(() => {
        // Restore the button state.
        refreshBtn.disabled = false;
        refreshBtn.innerHTML = '刷新文档列表';
    });
}
// Delete a document chunk by id after user confirmation, then refresh.
function deleteDocument(documentId) {
    if (!confirm('确定要删除这个文档吗?')) {
        return;
    }
    fetch(`/documents/${documentId}`, { method: 'DELETE' })
        .then((response) => response.json())
        .then((data) => {
            if (!data.success) {
                alert(`删除失败:${data.error}`);
                return;
            }
            // Deletion succeeded — re-render the list.
            loadDocuments();
        })
        .catch((error) => {
            alert(`删除失败:${error.message}`);
        });
}
// Submit the question to POST /ask and stream the answer into the page.
function askQuestion() {
    const questionInput = document.getElementById('question');
    const askBtn = document.getElementById('ask-btn');
    const answerBox = document.getElementById('answer');
    const question = questionInput.value.trim();
    if (!question) {
        alert('请输入问题');
        return;
    }
    // Clear the previous answer.
    answerBox.textContent = '';
    // Disable the button and show a spinner while waiting.
    askBtn.disabled = true;
    askBtn.innerHTML = '<span class="loading"></span>正在思考...';
    fetch('/ask', {
        method: 'POST',
        headers: {
            'Content-Type': 'application/json'
        },
        body: JSON.stringify({ query: question })
    })
    .then(response => {
        if (!response.ok) {
            throw new Error('问答失败');
        }
        // Consume the streamed plain-text response chunk by chunk.
        const reader = response.body.getReader();
        const decoder = new TextDecoder('utf-8');
        function read() {
            return reader.read().then(({ done, value }) => {
                if (done) {
                    return;
                }
                // Decode and append this chunk to the answer box.
                const chunk = decoder.decode(value, { stream: true });
                answerBox.textContent += chunk;
                // Recurse until the stream is exhausted.
                return read();
            });
        }
        return read();
    })
    .catch(error => {
        answerBox.textContent = `问答失败:${error.message}`;
    })
    .finally(() => {
        // Restore the button state.
        askBtn.disabled = false;
        askBtn.innerHTML = '提问';
    });
}
// Render a transient success/error banner inside `element`.
function showMessage(element, message, type) {
    const banner = document.createElement('div');
    banner.className = (type === 'success') ? 'success-message' : 'error-message';
    banner.textContent = message;
    element.innerHTML = '';
    element.appendChild(banner);
    // Auto-dismiss after 3 seconds.
    setTimeout(() => banner.remove(), 3000);
}
// Pressing Enter in the question box also submits the question.
document.getElementById('question').addEventListener('keypress', function(e) {
    if (e.key === 'Enter') {
        askQuestion();
    }
});
</script>
</body>
</html>

2931
uv.lock generated Normal file

File diff suppressed because it is too large Load Diff