@@ -760,166 +760,168 @@ class DocumentService:
                     )
                 db.session.add(dataset_process_rule)
                 db.session.commit()
-            position = DocumentService.get_documents_position(dataset.id)
-            document_ids = []
-            duplicate_document_ids = []
-            if document_data["data_source"]["type"] == "upload_file":
-                upload_file_list = document_data["data_source"]["info_list"]["file_info_list"]["file_ids"]
-                for file_id in upload_file_list:
-                    file = (
-                        db.session.query(UploadFile)
-                        .filter(UploadFile.tenant_id == dataset.tenant_id, UploadFile.id == file_id)
-                        .first()
-                    )
+            lock_name = "add_document_lock_dataset_id_{}".format(dataset.id)
+            with redis_client.lock(lock_name, timeout=600):
+                position = DocumentService.get_documents_position(dataset.id)
+                document_ids = []
+                duplicate_document_ids = []
+                if document_data["data_source"]["type"] == "upload_file":
+                    upload_file_list = document_data["data_source"]["info_list"]["file_info_list"]["file_ids"]
+                    for file_id in upload_file_list:
+                        file = (
+                            db.session.query(UploadFile)
+                            .filter(UploadFile.tenant_id == dataset.tenant_id, UploadFile.id == file_id)
+                            .first()
+                        )

-                    # raise error if file not found
-                    if not file:
-                        raise FileNotExistsError()
+                        # raise error if file not found
+                        if not file:
+                            raise FileNotExistsError()

-                    file_name = file.name
-                    data_source_info = {
-                        "upload_file_id": file_id,
-                    }
-                    # check duplicate
-                    if document_data.get("duplicate", False):
-                        document = Document.query.filter_by(
-                            dataset_id=dataset.id,
-                            tenant_id=current_user.current_tenant_id,
-                            data_source_type="upload_file",
-                            enabled=True,
-                            name=file_name,
-                        ).first()
-                        if document:
-                            document.dataset_process_rule_id = dataset_process_rule.id
-                            document.updated_at = datetime.datetime.utcnow()
-                            document.created_from = created_from
-                            document.doc_form = document_data["doc_form"]
-                            document.doc_language = document_data["doc_language"]
-                            document.data_source_info = json.dumps(data_source_info)
-                            document.batch = batch
-                            document.indexing_status = "waiting"
-                            db.session.add(document)
-                            documents.append(document)
-                            duplicate_document_ids.append(document.id)
-                            continue
-                    document = DocumentService.build_document(
-                        dataset,
-                        dataset_process_rule.id,
-                        document_data["data_source"]["type"],
-                        document_data["doc_form"],
-                        document_data["doc_language"],
-                        data_source_info,
-                        created_from,
-                        position,
-                        account,
-                        file_name,
-                        batch,
-                    )
-                    db.session.add(document)
-                    db.session.flush()
-                    document_ids.append(document.id)
-                    documents.append(document)
-                    position += 1
-            elif document_data["data_source"]["type"] == "notion_import":
-                notion_info_list = document_data["data_source"]["info_list"]["notion_info_list"]
-                exist_page_ids = []
-                exist_document = {}
-                documents = Document.query.filter_by(
-                    dataset_id=dataset.id,
-                    tenant_id=current_user.current_tenant_id,
-                    data_source_type="notion_import",
-                    enabled=True,
-                ).all()
-                if documents:
-                    for document in documents:
-                        data_source_info = json.loads(document.data_source_info)
-                        exist_page_ids.append(data_source_info["notion_page_id"])
-                        exist_document[data_source_info["notion_page_id"]] = document.id
-                for notion_info in notion_info_list:
-                    workspace_id = notion_info["workspace_id"]
-                    data_source_binding = DataSourceOauthBinding.query.filter(
-                        db.and_(
-                            DataSourceOauthBinding.tenant_id == current_user.current_tenant_id,
-                            DataSourceOauthBinding.provider == "notion",
-                            DataSourceOauthBinding.disabled == False,
-                            DataSourceOauthBinding.source_info["workspace_id"] == f'"{workspace_id}"',
+                        file_name = file.name
+                        data_source_info = {
+                            "upload_file_id": file_id,
+                        }
+                        # check duplicate
+                        if document_data.get("duplicate", False):
+                            document = Document.query.filter_by(
+                                dataset_id=dataset.id,
+                                tenant_id=current_user.current_tenant_id,
+                                data_source_type="upload_file",
+                                enabled=True,
+                                name=file_name,
+                            ).first()
+                            if document:
+                                document.dataset_process_rule_id = dataset_process_rule.id
+                                document.updated_at = datetime.datetime.utcnow()
+                                document.created_from = created_from
+                                document.doc_form = document_data["doc_form"]
+                                document.doc_language = document_data["doc_language"]
+                                document.data_source_info = json.dumps(data_source_info)
+                                document.batch = batch
+                                document.indexing_status = "waiting"
+                                db.session.add(document)
+                                documents.append(document)
+                                duplicate_document_ids.append(document.id)
+                                continue
+                        document = DocumentService.build_document(
+                            dataset,
+                            dataset_process_rule.id,
+                            document_data["data_source"]["type"],
+                            document_data["doc_form"],
+                            document_data["doc_language"],
+                            data_source_info,
+                            created_from,
+                            position,
+                            account,
+                            file_name,
+                            batch,
                         )
-                    ).first()
-                    if not data_source_binding:
-                        raise ValueError("Data source binding not found.")
-                    for page in notion_info["pages"]:
-                        if page["page_id"] not in exist_page_ids:
-                            data_source_info = {
-                                "notion_workspace_id": workspace_id,
-                                "notion_page_id": page["page_id"],
-                                "notion_page_icon": page["page_icon"],
-                                "type": page["type"],
-                            }
-                            document = DocumentService.build_document(
-                                dataset,
-                                dataset_process_rule.id,
-                                document_data["data_source"]["type"],
-                                document_data["doc_form"],
-                                document_data["doc_language"],
-                                data_source_info,
-                                created_from,
-                                position,
-                                account,
-                                page["page_name"],
-                                batch,
+                        db.session.add(document)
+                        db.session.flush()
+                        document_ids.append(document.id)
+                        documents.append(document)
+                        position += 1
+                elif document_data["data_source"]["type"] == "notion_import":
+                    notion_info_list = document_data["data_source"]["info_list"]["notion_info_list"]
+                    exist_page_ids = []
+                    exist_document = {}
+                    documents = Document.query.filter_by(
+                        dataset_id=dataset.id,
+                        tenant_id=current_user.current_tenant_id,
+                        data_source_type="notion_import",
+                        enabled=True,
+                    ).all()
+                    if documents:
+                        for document in documents:
+                            data_source_info = json.loads(document.data_source_info)
+                            exist_page_ids.append(data_source_info["notion_page_id"])
+                            exist_document[data_source_info["notion_page_id"]] = document.id
+                    for notion_info in notion_info_list:
+                        workspace_id = notion_info["workspace_id"]
+                        data_source_binding = DataSourceOauthBinding.query.filter(
+                            db.and_(
+                                DataSourceOauthBinding.tenant_id == current_user.current_tenant_id,
+                                DataSourceOauthBinding.provider == "notion",
+                                DataSourceOauthBinding.disabled == False,
+                                DataSourceOauthBinding.source_info["workspace_id"] == f'"{workspace_id}"',
                             )
-                            db.session.add(document)
-                            db.session.flush()
-                            document_ids.append(document.id)
-                            documents.append(document)
-                            position += 1
+                        ).first()
+                        if not data_source_binding:
+                            raise ValueError("Data source binding not found.")
+                        for page in notion_info["pages"]:
+                            if page["page_id"] not in exist_page_ids:
+                                data_source_info = {
+                                    "notion_workspace_id": workspace_id,
+                                    "notion_page_id": page["page_id"],
+                                    "notion_page_icon": page["page_icon"],
+                                    "type": page["type"],
+                                }
+                                document = DocumentService.build_document(
+                                    dataset,
+                                    dataset_process_rule.id,
+                                    document_data["data_source"]["type"],
+                                    document_data["doc_form"],
+                                    document_data["doc_language"],
+                                    data_source_info,
+                                    created_from,
+                                    position,
+                                    account,
+                                    page["page_name"],
+                                    batch,
+                                )
+                                db.session.add(document)
+                                db.session.flush()
+                                document_ids.append(document.id)
+                                documents.append(document)
+                                position += 1
+                            else:
+                                exist_document.pop(page["page_id"])
+                    # delete not selected documents
+                    if len(exist_document) > 0:
+                        clean_notion_document_task.delay(list(exist_document.values()), dataset.id)
+                elif document_data["data_source"]["type"] == "website_crawl":
+                    website_info = document_data["data_source"]["info_list"]["website_info_list"]
+                    urls = website_info["urls"]
+                    for url in urls:
+                        data_source_info = {
+                            "url": url,
+                            "provider": website_info["provider"],
+                            "job_id": website_info["job_id"],
+                            "only_main_content": website_info.get("only_main_content", False),
+                            "mode": "crawl",
+                        }
+                        if len(url) > 255:
+                            document_name = url[:200] + "..."
                         else:
-                            exist_document.pop(page["page_id"])
-                # delete not selected documents
-                if len(exist_document) > 0:
-                    clean_notion_document_task.delay(list(exist_document.values()), dataset.id)
-            elif document_data["data_source"]["type"] == "website_crawl":
-                website_info = document_data["data_source"]["info_list"]["website_info_list"]
-                urls = website_info["urls"]
-                for url in urls:
-                    data_source_info = {
-                        "url": url,
-                        "provider": website_info["provider"],
-                        "job_id": website_info["job_id"],
-                        "only_main_content": website_info.get("only_main_content", False),
-                        "mode": "crawl",
-                    }
-                    if len(url) > 255:
-                        document_name = url[:200] + "..."
-                    else:
-                        document_name = url
-                    document = DocumentService.build_document(
-                        dataset,
-                        dataset_process_rule.id,
-                        document_data["data_source"]["type"],
-                        document_data["doc_form"],
-                        document_data["doc_language"],
-                        data_source_info,
-                        created_from,
-                        position,
-                        account,
-                        document_name,
-                        batch,
-                    )
-                    db.session.add(document)
-                    db.session.flush()
-                    document_ids.append(document.id)
-                    documents.append(document)
-                    position += 1
-            db.session.commit()
+                            document_name = url
+                        document = DocumentService.build_document(
+                            dataset,
+                            dataset_process_rule.id,
+                            document_data["data_source"]["type"],
+                            document_data["doc_form"],
+                            document_data["doc_language"],
+                            data_source_info,
+                            created_from,
+                            position,
+                            account,
+                            document_name,
+                            batch,
+                        )
+                        db.session.add(document)
+                        db.session.flush()
+                        document_ids.append(document.id)
+                        documents.append(document)
+                        position += 1
+                db.session.commit()

-            # trigger async task
-            if document_ids:
-                document_indexing_task.delay(dataset.id, document_ids)
-            if duplicate_document_ids:
-                duplicate_document_indexing_task.delay(dataset.id, duplicate_document_ids)
+                # trigger async task
+                if document_ids:
+                    document_indexing_task.delay(dataset.id, document_ids)
+                if duplicate_document_ids:
+                    duplicate_document_indexing_task.delay(dataset.id, duplicate_document_ids)

-        return documents, batch
+            return documents, batch

     @staticmethod
     def check_documents_upload_quota(count: int, features: FeatureModel):
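Note on the change above: the position lookup, the per-source document creation, db.session.commit(), and the indexing task dispatch now all run while holding a per-dataset Redis lock ("add_document_lock_dataset_id_{}" with timeout=600), so two concurrent add-document requests against the same dataset serialize instead of both reading the same position and inserting overlapping rows. A minimal sketch of the pattern, assuming a plain redis-py client in place of the project's redis_client wrapper, with hypothetical next_position / insert_document helpers standing in for DocumentService.get_documents_position and build_document plus the session commit:

import redis

# Assumption: a plain redis-py client; the diff uses the project's own redis_client wrapper.
r = redis.Redis(host="localhost", port=6379, db=0)


def next_position(dataset_id: str) -> int:
    # Hypothetical stand-in for DocumentService.get_documents_position:
    # returns max existing position + 1 for the dataset.
    ...


def insert_document(dataset_id: str, position: int) -> None:
    # Hypothetical stand-in for build_document + db.session.add + db.session.commit.
    ...


def add_document(dataset_id: str) -> None:
    # One lock per dataset, mirroring "add_document_lock_dataset_id_{}" in the diff.
    lock_name = f"add_document_lock_dataset_id_{dataset_id}"
    # The lock is a context manager: it blocks until acquired and releases on exit.
    # timeout=600 is the auto-expiry in Redis, so a crashed holder cannot block others forever.
    with r.lock(lock_name, timeout=600):
        position = next_position(dataset_id)
        insert_document(dataset_id, position)

Keeping the commit inside the with block matters: the lock is only released after the new rows are visible, so the next holder's position read reflects them.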