import os

from langchain_community.chat_models import ChatOllama
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain.retrievers.multi_query import MultiQueryRetriever

from get_vector_db import get_vector_db

LLM_MODEL = os.getenv('LLM_MODEL', 'qwen2:7b')

# Function to get the prompt templates for generating alternative questions and answering based on context
def get_prompt():
    QUERY_PROMPT = PromptTemplate(
        input_variables=["question"],
        template="""You are an AI language model assistant. Your task is to generate five
        different versions of the given user question to retrieve relevant documents from
        a vector database. By generating multiple perspectives on the user question, your
        goal is to help the user overcome some of the limitations of the distance-based
        similarity search. Provide these alternative questions separated by newlines.
        Original question: {question}""",
    )

    template = """Answer the question in Chinese based ONLY on the following context:
    {context}
    Question: {question}
    """

    prompt = ChatPromptTemplate.from_template(template)

    return QUERY_PROMPT, prompt

# Main function to handle the query process
def query(question):
    # Guard against empty input (the parameter is named question to avoid shadowing the built-in input)
    if not question:
        return None

    # Initialize the language model with the specified model name
    llm = ChatOllama(model=LLM_MODEL)
    # Get the vector database instance
    db = get_vector_db()
    # Get the prompt templates
    QUERY_PROMPT, prompt = get_prompt()

    # Set up the retriever to generate multiple queries using the language model and the query prompt
    retriever = MultiQueryRetriever.from_llm(
        db.as_retriever(),
        llm,
        prompt=QUERY_PROMPT
    )

    # Define the processing chain to retrieve context, generate the answer, and parse the output
    chain = (
        {"context": retriever, "question": RunnablePassthrough()}
        | prompt
        | llm
        | StrOutputParser()
    )

    response = chain.invoke(question)

    return response
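
The helper get_vector_db is imported from a local module that is not part of this snippet. As a minimal sketch of what such a module might look like, assuming the vector store is a persistent Chroma collection embedded with a local Ollama model (the collection name, path, and embedding model below are illustrative assumptions, not values from the source):

# get_vector_db.py -- hypothetical sketch, not the project's actual helper
import os

from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.vectorstores import Chroma

# Illustrative defaults; override via environment variables
CHROMA_PATH = os.getenv('CHROMA_PATH', 'chroma')
COLLECTION_NAME = os.getenv('COLLECTION_NAME', 'local-rag')
TEXT_EMBEDDING_MODEL = os.getenv('TEXT_EMBEDDING_MODEL', 'nomic-embed-text')

def get_vector_db():
    # Embed documents with a local Ollama embedding model
    embedding = OllamaEmbeddings(model=TEXT_EMBEDDING_MODEL)
    # Return a persistent Chroma collection backed by that embedding function
    return Chroma(
        collection_name=COLLECTION_NAME,
        persist_directory=CHROMA_PATH,
        embedding_function=embedding,
    )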
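
To exercise the chain end to end, assuming an Ollama server is running locally with the chat model pulled and documents already ingested into the vector store (the sample question is only a placeholder):

# Hypothetical smoke test for the query function
if __name__ == '__main__':
    answer = query('What is the main topic of the ingested documents?')
    print(answer)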