服务器 · 7 months ago
Parent
Current commit
60e40c4707

+ 5 - 0
.env.sample

@@ -0,0 +1,5 @@
+TEMP_FOLDER = './_temp'
+CHROMA_PATH = "chroma"
+COLLECTION_NAME = 'siwei_ai'
+LLM_MODEL = 'qwen2:7b'
+TEXT_EMBEDDING_MODEL = 'nomic-embed-text'
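
Since requirements1.txt below pins python-dotenv 1.0.1, these sample values are presumably loaded from a real `.env` copy at startup. A minimal sketch of that loading step (an assumption, not part of this commit):

```python
# Assumed setup: copy .env.sample to .env, then read it with python-dotenv.
import os
from dotenv import load_dotenv

load_dotenv()  # picks up .env from the current working directory, if present

# Fall back to the same defaults the modules in this commit use when a key is missing.
LLM_MODEL = os.getenv('LLM_MODEL', 'qwen2:7b')
TEXT_EMBEDDING_MODEL = os.getenv('TEXT_EMBEDDING_MODEL', 'nomic-embed-text')
print(LLM_MODEL, TEXT_EMBEDDING_MODEL)
```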

Binary
__pycache__/embed.cpython-310.pyc


Binary
__pycache__/get_vector_db.cpython-310.pyc


Binary
__pycache__/query.cpython-310.pyc


File diff suppressed because it is too large
+ 55 - 2
app.py
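
The app.py diff (+ 55 - 2) is suppressed above, so its exact contents are not visible here. One plausible way it could wire the embed() and query() helpers from this commit into Flask routes is sketched below; this is a hypothetical reconstruction, not the actual file:

```python
# Hypothetical sketch only -- the real app.py diff is not shown in this commit view.
from flask import Flask, request, jsonify
from embed import embed
from query import query

app = Flask(__name__)

@app.route('/embed', methods=['POST'])
def route_embed():
    # Expect a multipart upload with a 'file' field containing a PDF.
    file = request.files.get('file')
    if file and embed(file):
        return jsonify({'message': 'File embedded successfully'}), 200
    return jsonify({'error': 'File embedded unsuccessfully'}), 400

@app.route('/query', methods=['POST'])
def route_query():
    # Expect a JSON body like {"query": "..."}.
    data = request.get_json(silent=True) or {}
    response = query(data.get('query'))
    if response is None:
        return jsonify({'error': 'Something went wrong'}), 400
    return jsonify({'message': response}), 200

if __name__ == '__main__':
    app.run(host='0.0.0.0', port=8080, debug=True)
```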


Binary
chroma/cd7cb5a8-0622-4833-a6a2-d812be9d5da4/data_level0.bin


Binary
chroma/cd7cb5a8-0622-4833-a6a2-d812be9d5da4/header.bin


Binary
chroma/cd7cb5a8-0622-4833-a6a2-d812be9d5da4/length.bin


+ 0 - 0
chroma/cd7cb5a8-0622-4833-a6a2-d812be9d5da4/link_lists.bin


Binary
chroma/chroma.sqlite3


+ 48 - 0
embed.py

@@ -0,0 +1,48 @@
+import os
+from datetime import datetime
+from werkzeug.utils import secure_filename
+from langchain_community.document_loaders import UnstructuredPDFLoader
+from langchain_text_splitters import RecursiveCharacterTextSplitter
+from get_vector_db import get_vector_db
+
+TEMP_FOLDER = os.getenv('TEMP_FOLDER', './_temp')
+
+# Function to check if the uploaded file is allowed (only PDF files)
+def allowed_file(filename):
+    return '.' in filename and filename.rsplit('.', 1)[1].lower() in {'pdf'}
+
+# Function to save the uploaded file to the temporary folder
+def save_file(file):
+    # Save the uploaded file with a secure filename and return the file path
+    ct = datetime.now()
+    ts = ct.timestamp()
+    filename = str(ts) + "_" + secure_filename(file.filename)
+    file_path = os.path.join(TEMP_FOLDER, filename)
+    file.save(file_path)
+
+    return file_path
+
+# Function to load and split the data from the PDF file
+def load_and_split_data(file_path):
+    # Load the PDF file and split the data into chunks
+    loader = UnstructuredPDFLoader(file_path=file_path)
+    data = loader.load()
+    text_splitter = RecursiveCharacterTextSplitter(chunk_size=7500, chunk_overlap=100)
+    chunks = text_splitter.split_documents(data)
+
+    return chunks
+
+# Main function to handle the embedding process
+def embed(file):
+    # Check if the file is valid, save it, load and split the data, add to the database, and remove the temporary file
+    if file.filename != '' and file and allowed_file(file.filename):
+        file_path = save_file(file)
+        chunks = load_and_split_data(file_path)
+        db = get_vector_db()
+        db.add_documents(chunks)
+        db.persist()
+        os.remove(file_path)
+
+        return True
+
+    return False
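
A standalone way to exercise this pipeline without going through an HTTP upload, assuming an Ollama server is running locally with the nomic-embed-text model pulled and `sample.pdf` as a stand-in path:

```python
# Standalone sketch; 'sample.pdf' is a hypothetical local file, not part of this repo.
from embed import load_and_split_data
from get_vector_db import get_vector_db

chunks = load_and_split_data('sample.pdf')  # parse the PDF and split it into large chunks
db = get_vector_db()                        # open the Chroma collection defined below
db.add_documents(chunks)                    # embed each chunk via Ollama and store it
db.persist()                                # flush the collection to CHROMA_PATH
print(f'Indexed {len(chunks)} chunks')
```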

+ 18 - 0
get_vector_db.py

@@ -0,0 +1,18 @@
+import os
+from langchain_community.embeddings import OllamaEmbeddings
+from langchain_community.vectorstores.chroma import Chroma
+
+CHROMA_PATH = os.getenv('CHROMA_PATH', 'chroma')
+COLLECTION_NAME = os.getenv('COLLECTION_NAME', 'siwei_ai')
+TEXT_EMBEDDING_MODEL = os.getenv('TEXT_EMBEDDING_MODEL', 'nomic-embed-text')
+
+def get_vector_db():
+    embedding = OllamaEmbeddings(model=TEXT_EMBEDDING_MODEL,show_progress=True)
+
+    db = Chroma(
+        collection_name=COLLECTION_NAME,
+        persist_directory=CHROMA_PATH,
+        embedding_function=embedding
+    )
+
+    return db
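
A quick way to sanity-check the store (assumes an Ollama server is reachable on its default port with nomic-embed-text pulled, and that at least one document has already been embedded):

```python
# Smoke-test sketch: plain vector similarity search, no LLM involved.
from get_vector_db import get_vector_db

db = get_vector_db()
hits = db.similarity_search('测试查询', k=2)  # example query string
for doc in hits:
    print(doc.metadata.get('source'), doc.page_content[:80])
```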

+ 61 - 0
query.py

@@ -0,0 +1,61 @@
+import os
+from langchain_community.chat_models import ChatOllama
+from langchain.prompts import ChatPromptTemplate, PromptTemplate
+from langchain_core.output_parsers import StrOutputParser
+from langchain_core.runnables import RunnablePassthrough
+from langchain.retrievers.multi_query import MultiQueryRetriever
+from get_vector_db import get_vector_db
+
+LLM_MODEL = os.getenv('LLM_MODEL', 'qwen2:7b')
+
+# Function to get the prompt templates for generating alternative questions and answering based on context
+def get_prompt():
+    QUERY_PROMPT = PromptTemplate(
+        input_variables=["question"],
+        template="""You are an AI language model assistant. Your task is to generate five
+        different versions of the given user question to retrieve relevant documents from
+        a vector database. By generating multiple perspectives on the user question, your
+        goal is to help the user overcome some of the limitations of the distance-based
+        similarity search. Provide these alternative questions separated by newlines.
+        Original question: {question}""",
+    )
+
+    template = """Answer the question in Chinese based ONLY on the following context:
+    {context}
+    Question: {question}
+    """
+
+    prompt = ChatPromptTemplate.from_template(template)
+
+    return QUERY_PROMPT, prompt
+
+# Main function to handle the query process
+def query(input):
+    if input:
+        # Initialize the language model with the specified model name
+        llm = ChatOllama(model=LLM_MODEL)
+        # Get the vector database instance
+        db = get_vector_db()
+        # Get the prompt templates
+        QUERY_PROMPT, prompt = get_prompt()
+
+        # Set up the retriever to generate multiple queries using the language model and the query prompt
+        retriever = MultiQueryRetriever.from_llm(
+            db.as_retriever(), 
+            llm,
+            prompt=QUERY_PROMPT
+        )
+
+        # Define the processing chain to retrieve context, generate the answer, and parse the output
+        chain = (
+            {"context": retriever, "question": RunnablePassthrough()}
+            | prompt
+            | llm
+            | StrOutputParser()
+        )
+
+        response = chain.invoke(input)
+        
+        return response
+
+    return None
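
Invoking the chain directly from a Python shell, assuming `ollama pull qwen2:7b` has been run and the Chroma collection already holds embedded documents (the question string is only an example):

```python
# Usage sketch for query(); the answer comes back in Chinese per the prompt template.
from query import query

answer = query('文档的主要内容是什么？')  # example question
print(answer)
```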

+ 190 - 0
requirements1.txt

@@ -0,0 +1,190 @@
+aiohttp==3.9.5
+aiosignal==1.3.1
+annotated-types==0.7.0
+antlr4-python3-runtime==4.9.3
+anyio==4.4.0
+asgiref==3.8.1
+attrs==23.2.0
+backoff==2.2.1
+bcrypt==4.1.3
+beautifulsoup4==4.12.3
+blinker==1.8.2
+build==1.2.1
+cachetools==5.3.3
+certifi==2024.6.2
+cffi==1.16.0
+chardet==5.2.0
+charset-normalizer==3.3.2
+chroma-hnswlib==0.7.3
+chromadb==0.5.3
+click==8.1.7
+coloredlogs==15.0.1
+contourpy==1.2.1
+cryptography==42.0.8
+cycler==0.12.1
+dataclasses-json==0.6.7
+deepdiff==7.0.1
+Deprecated==1.2.14
+dnspython==2.6.1
+effdet==0.4.1
+email_validator==2.2.0
+emoji==2.12.1
+et-xmlfile==1.1.0
+fastapi==0.111.0
+fastapi-cli==0.0.4
+filelock==3.15.4
+filetype==1.2.0
+Flask==3.0.3
+flatbuffers==24.3.25
+fonttools==4.53.0
+frozenlist==1.4.1
+fsspec==2024.6.1
+google-api-core==2.19.1
+google-auth==2.30.0
+google-cloud-vision==3.7.2
+googleapis-common-protos==1.63.2
+grpcio==1.64.1
+grpcio-status==1.62.2
+h11==0.14.0
+httpcore==1.0.5
+httptools==0.6.1
+httpx==0.27.0
+huggingface-hub==0.23.4
+humanfriendly==10.0
+idna==3.7
+importlib_metadata==7.1.0
+importlib_resources==6.4.0
+iopath==0.1.10
+itsdangerous==2.2.0
+Jinja2==3.1.4
+joblib==1.4.2
+jsonpatch==1.33
+jsonpath-python==1.0.6
+jsonpointer==3.0.0
+kiwisolver==1.4.5
+kubernetes==30.1.0
+langchain==0.2.6
+langchain-community==0.2.6
+langchain-core==0.2.10
+langchain-text-splitters==0.2.2
+langdetect==1.0.9
+langsmith==0.1.82
+layoutparser==0.3.4
+lxml==5.2.2
+Markdown==3.6
+markdown-it-py==3.0.0
+MarkupSafe==2.1.5
+marshmallow==3.21.3
+matplotlib==3.9.0
+mdurl==0.1.2
+mmh3==4.1.0
+monotonic==1.6
+mpmath==1.3.0
+multidict==6.0.5
+mypy-extensions==1.0.0
+nest-asyncio==1.6.0
+networkx==3.3
+nltk==3.8.1
+numpy==1.26.4
+oauthlib==3.2.2
+olefile==0.47
+omegaconf==2.3.0
+onnx==1.16.1
+onnxruntime==1.18.1
+opencv-python==4.10.0.84
+openpyxl==3.1.5
+opentelemetry-api==1.25.0
+opentelemetry-exporter-otlp-proto-common==1.25.0
+opentelemetry-exporter-otlp-proto-grpc==1.25.0
+opentelemetry-instrumentation==0.46b0
+opentelemetry-instrumentation-asgi==0.46b0
+opentelemetry-instrumentation-fastapi==0.46b0
+opentelemetry-proto==1.25.0
+opentelemetry-sdk==1.25.0
+opentelemetry-semantic-conventions==0.46b0
+opentelemetry-util-http==0.46b0
+ordered-set==4.1.0
+orjson==3.10.5
+overrides==7.7.0
+packaging==24.1
+pandas==2.2.2
+pdf2image==1.17.0
+pdfminer.six==20231228
+pdfplumber==0.11.1
+pikepdf==9.0.0
+pillow==10.3.0
+pillow_heif==0.16.0
+portalocker==2.10.0
+posthog==3.5.0
+proto-plus==1.24.0
+protobuf==4.25.3
+psutil==6.0.0
+pyasn1==0.6.0
+pyasn1_modules==0.4.0
+pycocotools==2.0.8
+pycparser==2.22
+pydantic==2.7.4
+pydantic_core==2.18.4
+Pygments==2.18.0
+pypandoc==1.13
+pyparsing==3.1.2
+pypdf==4.2.0
+pypdfium2==4.30.0
+PyPika==0.48.9
+pyproject_hooks==1.1.0
+pytesseract==0.3.10
+python-dateutil==2.9.0.post0
+python-docx==1.1.2
+python-dotenv==1.0.1
+python-iso639==2024.4.27
+python-magic==0.4.27
+python-multipart==0.0.9
+python-oxmsg==0.0.1
+python-pptx==0.6.23
+pytz==2024.1
+PyYAML==6.0.1
+rapidfuzz==3.9.3
+regex==2024.5.15
+requests==2.32.3
+requests-oauthlib==2.0.0
+requests-toolbelt==1.0.0
+rich==13.7.1
+rsa==4.9
+safetensors==0.4.3
+scipy==1.14.0
+shellingham==1.5.4
+six==1.16.0
+sniffio==1.3.1
+soupsieve==2.5
+SQLAlchemy==2.0.31
+starlette==0.37.2
+sympy==1.12.1
+tabulate==0.9.0
+tenacity==8.4.2
+timm==1.0.7
+tokenizers==0.19.1
+# torch==2.3.1
+# torchvision==0.18.1
+tqdm==4.66.4
+transformers==4.42.3
+typer==0.12.3
+typing-inspect==0.9.0
+typing_extensions==4.12.2
+tzdata==2024.1
+ujson==5.10.0
+unstructured==0.14.9
+unstructured-client==0.23.8
+unstructured-inference==0.7.36
+unstructured.pytesseract==0.3.12
+urllib3==2.2.2
+uvicorn==0.30.1
+# uvloop==0.19.0
+watchfiles==0.22.0
+websocket-client==1.8.0
+websockets==12.0
+Werkzeug==3.0.3
+wrapt==1.16.0
+xlrd==2.0.1
+XlsxWriter==3.2.0
+yarl==1.9.4
+zipp==3.19.2

Some files were not shown because too many files changed in this diff