9 місяців тому · 237708cb64
--- a/111.py
+++ b/111.py
--- a/app.py
+++ b/app.py
--- a/app.spec
+++ b/app.spec
@@ -1,38 +0,0 @@
 
				-# -*- mode: python ; coding: utf-8 -*-
			
 
				-
			
 
				-
			
 
				-a = Analysis(
			
 
				-    ['app.py'],
			
 
				-    pathex=[],
			
 
				-    binaries=[],
			
 
				-    datas=[],
			
 
				-    hiddenimports=[],
			
 
				-    hookspath=[],
			
 
				-    hooksconfig={},
			
 
				-    runtime_hooks=[],
			
 
				-    excludes=[],
			
 
				-    noarchive=False,
			
 
				-    optimize=0,
			
 
				-)
			
 
				-pyz = PYZ(a.pure)
			
 
				-
			
 
				-exe = EXE(
			
 
				-    pyz,
			
 
				-    a.scripts,
			
 
				-    a.binaries,
			
 
				-    a.datas,
			
 
				-    [],
			
 
				-    name='app',
			
 
				-    debug=False,
			
 
				-    bootloader_ignore_signals=False,
			
 
				-    strip=False,
			
 
				-    upx=True,
			
 
				-    upx_exclude=[],
			
 
				-    runtime_tmpdir=None,
			
 
				-    console=True,
			
 
				-    disable_windowed_traceback=False,
			
 
				-    argv_emulation=False,
			
 
				-    target_arch=None,
			
 
				-    codesign_identity=None,
			
 
				-    entitlements_file=None,
			
 
				-)
			
--- a/app/services/chat_service.py
+++ b/app/services/chat_service.py
@@ -83,7 +83,7 @@ def clear_chat_history():
 
				 def create_chat(msg,type):
			
 
				     # msg = data['msg']
			
 
				     msg = replace_word(msg, target_word)
			
 
				-    words_to_replace1 = ["爆破", "爆坡"]
			
 
				+    words_to_replace1 = ["爆破", "爆坡","鲍坡"]
			
 
				     for word in words_to_replace1:
			
 
				         msg = msg.replace(word, "抱坡")
			
 
				     print(msg)
			
--- a/embed.py
+++ b/embed.py
@@ -1,48 +0,0 @@
 
				-import os
			
 
				-from datetime import datetime
			
 
				-from werkzeug.utils import secure_filename
			
 
				-from langchain_community.document_loaders import UnstructuredPDFLoader
			
 
				-from langchain_text_splitters import RecursiveCharacterTextSplitter
			
 
				-from get_vector_db import get_vector_db
			
 
				-
			
 
				-TEMP_FOLDER = os.getenv('TEMP_FOLDER', './_temp')
			
 
				-
			
 
				-# Function to check if the uploaded file is allowed (only PDF files)
			
 
				-def allowed_file(filename):
			
 
				-    return '.' in filename and filename.rsplit('.', 1)[1].lower() in {'pdf'}
			
 
				-
			
 
				-# Function to save the uploaded file to the temporary folder
			
 
				-def save_file(file):
			
 
				-    # Save the uploaded file with a secure filename and return the file path
			
 
				-    ct = datetime.now()
			
 
				-    ts = ct.timestamp()
			
 
				-    filename = str(ts) + "_" + secure_filename(file.filename)
			
 
				-    file_path = os.path.join(TEMP_FOLDER, filename)
			
 
				-    file.save(file_path)
			
 
				-
			
 
				-    return file_path
			
 
				-
			
 
				-# Function to load and split the data from the PDF file
			
 
				-def load_and_split_data(file_path):
			
 
				-    # Load the PDF file and split the data into chunks
			
 
				-    loader = UnstructuredPDFLoader(file_path=file_path)
			
 
				-    data = loader.load()
			
 
				-    text_splitter = RecursiveCharacterTextSplitter(chunk_size=7500, chunk_overlap=100)
			
 
				-    chunks = text_splitter.split_documents(data)
			
 
				-
			
 
				-    return chunks
			
 
				-
			
 
				-# Main function to handle the embedding process
			
 
				-def embed(file):
			
 
				-    # Check if the file is valid, save it, load and split the data, add to the database, and remove the temporary file
			
 
				-    if file.filename != '' and file and allowed_file(file.filename):
			
 
				-        file_path = save_file(file)
			
 
				-        chunks = load_and_split_data(file_path)
			
 
				-        db = get_vector_db()
			
 
				-        db.add_documents(chunks)
			
 
				-        db.persist()
			
 
				-        os.remove(file_path)
			
 
				-
			
 
				-        return True
			
 
				-
			
 
				-    return False
			
--- a/get_vector_db.py
+++ b/get_vector_db.py
@@ -1,18 +0,0 @@
 
				-import os
			
 
				-from langchain_community.embeddings import OllamaEmbeddings
			
 
				-from langchain_community.vectorstores.chroma import Chroma
			
 
				-
			
 
				-CHROMA_PATH = os.getenv('CHROMA_PATH', 'chroma')
			
 
				-COLLECTION_NAME = os.getenv('COLLECTION_NAME', 'siwei_ai')
			
 
				-TEXT_EMBEDDING_MODEL = os.getenv('TEXT_EMBEDDING_MODEL', 'nomic-embed-text')
			
 
				-
			
 
				-def get_vector_db():
			
 
				-    embedding = OllamaEmbeddings(model=TEXT_EMBEDDING_MODEL,show_progress=True,num_gpu=0,num_thread=4)
			
 
				-
			
 
				-    db = Chroma(
			
 
				-        collection_name=COLLECTION_NAME,
			
 
				-        persist_directory=CHROMA_PATH,
			
 
				-        embedding_function=embedding
			
 
				-    )
			
 
				-
			
 
				-    return db
			
--- a/query.py
+++ b/query.py
@@ -1,62 +0,0 @@
 
				-import os
			
 
				-from langchain_community.chat_models import ChatOllama
			
 
				-from langchain.prompts import ChatPromptTemplate, PromptTemplate
			
 
				-from langchain_core.output_parsers import StrOutputParser
			
 
				-from langchain_core.runnables import RunnablePassthrough
			
 
				-from langchain.retrievers.multi_query import MultiQueryRetriever
			
 
				-from get_vector_db import get_vector_db
			
 
				-
			
 
				-LLM_MODEL = os.getenv('LLM_MODEL', 'qwen2:7b')
			
 
				-
			
 
				-# Function to get the prompt templates for generating alternative questions and answering based on context
			
 
				-def get_prompt():
			
 
				-    QUERY_PROMPT = PromptTemplate(
			
 
				-        input_variables=["question"],
			
 
				-        template="""你是一名AI语言模型助理。你的任务是生成三个
			
 
				-        从中检索相关文档的给定用户问题的不同版本
			
 
				-        矢量数据库。通过对用户问题生成多个视角
			
 
				-        目标是帮助用户克服基于距离的一些局限性
			
 
				-        相似性搜索。请提供这些用换行符分隔的备选问题。
			
 
				-        Original question: {question}""",
			
 
				-    )
			
 
				-
			
 
				-    template = """仅根据以下上下文用中文回答问题：
			
 
				-    {context},请严格以markdown格式输出并保障寄送格式正确无误，
			
 
				-    Question: {question}
			
 
				-    """
			
 
				-    # Question: {question}
			
 
				-
			
 
				-
			
 
				-    prompt = ChatPromptTemplate.from_template(template)
			
 
				-    return QUERY_PROMPT, prompt
			
 
				-
			
 
				-# Main function to handle the query process
			
 
				-def query(input):
			
 
				-    if input:
			
 
				-        # Initialize the language model with the specified model name
			
 
				-        llm = ChatOllama(model=LLM_MODEL,keep_alive=-1,num_gpu=0)
			
 
				-        # Get the vector database instance
			
 
				-        db = get_vector_db()
			
 
				-        # Get the prompt templates
			
 
				-        QUERY_PROMPT, prompt = get_prompt()
			
 
				-
			
 
				-        # Set up the retriever to generate multiple queries using the language model and the query prompt
			
 
				-        retriever = MultiQueryRetriever.from_llm(
			
 
				-            db.as_retriever(), 
			
 
				-            llm,
			
 
				-            prompt=QUERY_PROMPT
			
 
				-        )
			
 
				-
			
 
				-        # Define the processing chain to retrieve context, generate the answer, and parse the output
			
 
				-        chain = (
			
 
				-            {"context": retriever, "question": RunnablePassthrough()}
			
 
				-            | prompt
			
 
				-            | llm
			
 
				-            | StrOutputParser()
			
 
				-        )
			
 
				-
			
 
				-        response = chain.invoke(input)
			
 
				-        
			
 
				-        return response
			
 
				-
			
 
				-    return None
			
--- a/vocal.py
+++ b/vocal.py
@@ -1,58 +0,0 @@
 
				-# from modelscope.pipelines import pipeline
			
 
				-# from modelscope.utils.constant import Tasks
			
 
				-# import time
			
 
				-# import torch
			
 
				-
			
 
				-# # print(torch.__version__) # 查看torch当前版本号
			
 
				-
			
 
				-# # print(torch.version.cuda) # 编译当前版本的torch使用的cuda版本号
			
 
				-
			
 
				-# # print(torch.cuda.is_available()) # 查看当前cuda是否可用于当前版本的Torch，如果输出True，则表示可用
			
 
				-
			
 
				-
			
 
				-
			
 
				-# def voice_text(input_video_path,model='iic/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch'):
			
 
				-#     inference_pipeline = pipeline(
			
 
				-#     task=Tasks.auto_speech_recognition,
			
 
				-#     # model='iic/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch',
			
 
				-#     model=model,
			
 
				-#     # model="model\punc_ct-transformer_cn-en-common-vocab471067-large",
			
 
				-#     model_revision="v2.0.4",
			
 
				-#     device='gpu')
			
 
				-
			
 
				-#     res = inference_pipeline(input_video_path)
			
 
				-#     # print(res)
			
 
				-#     texts = [item['text'] for item in res]
			
 
				-
			
 
				-#     # print(texts)
			
 
				-#     result = ' '.join(texts)
			
 
				-#     return result
			
 
				-
			
 
				-# if  __name__ == "__main__":
			
 
				-#     start_time = time.time()
			
 
				-#     inference_pipeline = pipeline(
			
 
				-#         task=Tasks.auto_speech_recognition,
			
 
				-#         # model='iic/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch',
			
 
				-#         model='iic/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch',
			
 
				-#         # model="model\punc_ct-transformer_cn-en-common-vocab471067-large",
			
 
				-#         model_revision="v2.0.4",
			
 
				-#         device='gpu')
			
 
				-
			
 
				-#     # rec_result = inference_pipeline('https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_vad_punc_example.wav')
			
 
				-
			
 
				-#     # 替换为本地语音文件路径
			
 
				-#     local_audio_path = 'data/audio/5bf77846-0193-4f35-92f7-09ce51ee3793.mp3'
			
 
				-#     res = inference_pipeline(local_audio_path)
			
 
				-#     # print(res)
			
 
				-#     texts = [item['text'] for item in res]
			
 
				-
			
 
				-#     # print(texts)
			
 
				-#     result = ' '.join(texts)
			
 
				-#     print(result)
			
 
				-
			
 
				-
			
 
				-#     end_time = time.time()
			
 
				-#     # 计算时间差
			
 
				-#     elapsed_time = end_time - start_time
			
 
				-
			
 
				-#     print(f"耗时: {elapsed_time} 秒")
			
--- a/voice_translation_test.py
+++ b/voice_translation_test.py
@@ -1,52 +0,0 @@
 
				-# from funasr import AutoModel
			
 
				-# import time
			
 
				-
			
 
				-# def vocal_text(input_video_path):
			
 
				-#     model = AutoModel(model="./Voice_translation", model_revision="v2.0.4",
			
 
				-#                     vad_model="./Endpoint_detection", vad_model_revision="v2.0.4",
			
 
				-#                     punc_model="./Ct_punc", punc_model_revision="v2.0.4",
			
 
				-#                     use_cuda=True,use_fast = True,
			
 
				-#                     )
			
 
				-#     res = model.generate(input_video_path,
			
 
				-#                 batch_size_s=30,
			
 
				-#                 hotword='test')
			
 
				-
			
 
				-
			
 
				-#     texts = [item['text'] for item in res]
			
 
				-
			
 
				-
			
 
				-#     result = ' '.join(texts)
			
 
				-#     return result
			
 
				-
			
 
				-
			
 
				-# if  __name__ == "__main__":
			
 
				-#     start_time = time.time()
			
 
				-
			
 
				-
			
 
				-#     model = AutoModel(model="./Voice_translation", model_revision="v2.0.4",
			
 
				-#                     vad_model="./Endpoint_detection", vad_model_revision="v2.0.4",
			
 
				-#                     punc_model="./Ct_punc", punc_model_revision="v2.0.4",
			
 
				-#                     )
			
 
				-#     res = model.generate(input="./data/audio/5bf77846-0193-4f35-92f7-09ce51ee3793.mp3",
			
 
				-#                 batch_size_s=30,
			
 
				-#                 hotword='test')
			
 
				-
			
 
				-#     print(res)
			
 
				-#     texts = [item['text'] for item in res]
			
 
				-
			
 
				-#     print(texts)
			
 
				-#     result = ' '.join(texts)
			
 
				-#     print(result)
			
 
				-
			
 
				-
			
 
				-# # def save(input,savepath):
			
 
				-# #     outputs = open(savepath, 'w', encoding='utf-8')
			
 
				-# #     outputs.write(input+'\n')
			
 
				-# #     outputs.close()
			
 
				-# # save(input=result,savepath=r"F:\work\voice_translation\datasets\1.txt")
			
 
				-
			
 
				-#     end_time = time.time()
			
 
				-#     # 计算时间差
			
 
				-#     elapsed_time = end_time - start_time
			
 
				-
			
 
				-#     print(f"耗时: {elapsed_time} 秒")