index.tsx 41 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011
  1. 'use client'
  2. import React, { useCallback, useEffect, useLayoutEffect, useRef, useState } from 'react'
  3. import { useTranslation } from 'react-i18next'
  4. import { useContext } from 'use-context-selector'
  5. import { useBoolean } from 'ahooks'
  6. import { XMarkIcon } from '@heroicons/react/20/solid'
  7. import { RocketLaunchIcon } from '@heroicons/react/24/outline'
  8. import {
  9. RiCloseLine,
  10. } from '@remixicon/react'
  11. import Link from 'next/link'
  12. import { groupBy } from 'lodash-es'
  13. import PreviewItem, { PreviewType } from './preview-item'
  14. import LanguageSelect from './language-select'
  15. import s from './index.module.css'
  16. import unescape from './unescape'
  17. import escape from './escape'
  18. import cn from '@/utils/classnames'
  19. import type { CrawlOptions, CrawlResultItem, CreateDocumentReq, CustomFile, FileIndexingEstimateResponse, FullDocumentDetail, IndexingEstimateParams, NotionInfo, PreProcessingRule, ProcessRule, Rules, createDocumentResponse } from '@/models/datasets'
  20. import {
  21. createDocument,
  22. createFirstDocument,
  23. fetchFileIndexingEstimate as didFetchFileIndexingEstimate,
  24. fetchDefaultProcessRule,
  25. } from '@/service/datasets'
  26. import Button from '@/app/components/base/button'
  27. import Loading from '@/app/components/base/loading'
  28. import FloatRightContainer from '@/app/components/base/float-right-container'
  29. import RetrievalMethodConfig from '@/app/components/datasets/common/retrieval-method-config'
  30. import EconomicalRetrievalMethodConfig from '@/app/components/datasets/common/economical-retrieval-method-config'
  31. import { type RetrievalConfig } from '@/types/app'
  32. import { ensureRerankModelSelected, isReRankModelSelected } from '@/app/components/datasets/common/check-rerank-model'
  33. import Toast from '@/app/components/base/toast'
  34. import { formatNumber } from '@/utils/format'
  35. import type { NotionPage } from '@/models/common'
  36. import { DataSourceProvider } from '@/models/common'
  37. import { DataSourceType, DocForm } from '@/models/datasets'
  38. import NotionIcon from '@/app/components/base/notion-icon'
  39. import Switch from '@/app/components/base/switch'
  40. import { MessageChatSquare } from '@/app/components/base/icons/src/public/common'
  41. import { useDatasetDetailContext } from '@/context/dataset-detail'
  42. import I18n from '@/context/i18n'
  43. import { IS_CE_EDITION } from '@/config'
  44. import { RETRIEVE_METHOD } from '@/types/app'
  45. import useBreakpoints, { MediaType } from '@/hooks/use-breakpoints'
  46. import Tooltip from '@/app/components/base/tooltip'
  47. import { useDefaultModel, useModelList, useModelListAndDefaultModelAndCurrentProviderAndModel } from '@/app/components/header/account-setting/model-provider-page/hooks'
  48. import { LanguagesSupported } from '@/i18n/language'
  49. import ModelSelector from '@/app/components/header/account-setting/model-provider-page/model-selector'
  50. import type { DefaultModel } from '@/app/components/header/account-setting/model-provider-page/declarations'
  51. import { ModelTypeEnum } from '@/app/components/header/account-setting/model-provider-page/declarations'
  52. import { Globe01 } from '@/app/components/base/icons/src/vender/line/mapsAndTravel'
  // Union of the types of all property values of T.
  type ValueOf<T> = T[keyof T]
  // Props accepted by the StepTwo component.
  type StepTwoProps = {
    // When true, the request built on submit carries `original_document_id` and `onSave` is called afterwards.
    isSetting?: boolean
    // Existing document; supplies the initial doc form, language and process rules when present.
    documentDetail?: FullDocumentDetail
    // Decides the default index type and whether the high-quality option can be picked.
    isAPIKeySet: boolean
    // Called when the user clicks the link in the "API key missing" warning.
    onSetting: () => void
    // Target dataset; when absent, submit goes through `createFirstDocument`.
    datasetId?: string
    // Pre-selected indexing type; when set, the index mode cards are rendered as disabled.
    indexingType?: ValueOf<IndexingType>
    // Data source type used while on the create page.
    dataSourceType: DataSourceType
    files: CustomFile[]
    // Defaults to [] in the component signature.
    notionPages?: NotionPage[]
    // Defaults to [] in the component signature.
    websitePages?: CrawlResultItem[]
    // Only `only_main_content` is read from this object in the visible code.
    crawlOptions?: CrawlOptions
    // Defaults to DataSourceProvider.fireCrawl in the component signature.
    websiteCrawlProvider?: DataSourceProvider
    // Defaults to '' in the component signature.
    websiteCrawlJobId?: string
    // Called with +1 after a document has been created successfully.
    onStepChange?: (delta: number) => void
    // Receives the locally selected index type after a successful create.
    updateIndexingTypeCache?: (type: string) => void
    // Receives the create response after a successful create.
    updateResultCache?: (res: createDocumentResponse) => void
    // Called after a successful submit when `isSetting` is true.
    onSave?: () => void
    // NOTE(review): not referenced in the visible part of the component; presumably wired to a cancel button further down.
    onCancel?: () => void
  }
  // Segmentation modes; the value is sent as `mode` of the process rule.
  enum SegmentType {
    AUTO = 'automatic',
    CUSTOM = 'custom',
  }
  // Indexing techniques; the value is sent as `indexing_technique` in request payloads.
  enum IndexingType {
    QUALIFIED = 'high_quality',
    ECONOMICAL = 'economy',
  }
  // Default separator in its escaped, displayable form (a literal backslash + "n", twice).
  // It is passed through `unescape` before being put into the process rule.
  const DEFAULT_SEGMENT_IDENTIFIER = '\\n\\n'
  83. const StepTwo = ({
  84. isSetting,
  85. documentDetail,
  86. isAPIKeySet,
  87. onSetting,
  88. datasetId,
  89. indexingType,
  90. dataSourceType: inCreatePageDataSourceType,
  91. files,
  92. notionPages = [],
  93. websitePages = [],
  94. crawlOptions,
  95. websiteCrawlProvider = DataSourceProvider.fireCrawl,
  96. websiteCrawlJobId = '',
  97. onStepChange,
  98. updateIndexingTypeCache,
  99. updateResultCache,
  100. onSave,
  101. onCancel,
  102. }: StepTwoProps) => {
  103. const { t } = useTranslation()
  104. const { locale } = useContext(I18n)
  105. const media = useBreakpoints()
  106. const isMobile = media === MediaType.mobile
  107. const { dataset: currentDataset, mutateDatasetRes } = useDatasetDetailContext()
  108. const isInCreatePage = !datasetId || (datasetId && !currentDataset?.data_source_type)
  109. const dataSourceType = isInCreatePage ? inCreatePageDataSourceType : currentDataset?.data_source_type
  110. const scrollRef = useRef<HTMLDivElement>(null)
  111. const [scrolled, setScrolled] = useState(false)
  112. const previewScrollRef = useRef<HTMLDivElement>(null)
  113. const [previewScrolled, setPreviewScrolled] = useState(false)
  114. const [segmentationType, setSegmentationType] = useState<SegmentType>(SegmentType.AUTO)
  115. const [segmentIdentifier, doSetSegmentIdentifier] = useState(DEFAULT_SEGMENT_IDENTIFIER)
  // Stores a separator in escaped form; an empty value falls back to the default separator.
  const setSegmentIdentifier = useCallback((value: string) => {
    if (!value) {
      doSetSegmentIdentifier(DEFAULT_SEGMENT_IDENTIFIER)
      return
    }
    doSetSegmentIdentifier(escape(value))
  }, [])
  119. const [max, setMax] = useState(4000) // default chunk length
  120. const [overlap, setOverlap] = useState(50)
  121. const [rules, setRules] = useState<PreProcessingRule[]>([])
  122. const [defaultConfig, setDefaultConfig] = useState<Rules>()
  123. const hasSetIndexType = !!indexingType
  124. const [indexType, setIndexType] = useState<ValueOf<IndexingType>>(
  125. (indexingType
  126. || isAPIKeySet)
  127. ? IndexingType.QUALIFIED
  128. : IndexingType.ECONOMICAL,
  129. )
  130. const [docForm, setDocForm] = useState<DocForm | string>(
  131. (datasetId && documentDetail) ? documentDetail.doc_form : DocForm.TEXT,
  132. )
  133. const [docLanguage, setDocLanguage] = useState<string>(
  134. (datasetId && documentDetail) ? documentDetail.doc_language : (locale !== LanguagesSupported[1] ? 'English' : 'Chinese'),
  135. )
  136. const [QATipHide, setQATipHide] = useState(false)
  137. const [previewSwitched, setPreviewSwitched] = useState(false)
  138. const [showPreview, { setTrue: setShowPreview, setFalse: hidePreview }] = useBoolean()
  139. const [customFileIndexingEstimate, setCustomFileIndexingEstimate] = useState<FileIndexingEstimateResponse | null>(null)
  140. const [automaticFileIndexingEstimate, setAutomaticFileIndexingEstimate] = useState<FileIndexingEstimateResponse | null>(null)
  // Estimate that belongs to the currently selected segmentation mode.
  const fileIndexingEstimate = segmentationType === SegmentType.AUTO
    ? automaticFileIndexingEstimate
    : customFileIndexingEstimate
  144. const [isCreating, setIsCreating] = useState(false)
  // Scroll listener for the form pane: flags whether it is scrolled away from the top.
  const scrollHandle = (e: Event) => {
    const { scrollTop } = e.target as HTMLDivElement
    setScrolled(scrollTop > 0)
  }
  // Scroll listener for the preview pane: flags whether it is scrolled away from the top.
  const previewScrollHandle = (e: Event) => {
    const { scrollTop } = e.target as HTMLDivElement
    setPreviewScrolled(scrollTop > 0)
  }
  /**
   * Returns `name` without its last extension.
   *
   * Names that have no extension (`README`) or that only start with a dot
   * (`.gitignore`) are returned unchanged instead of collapsing to an empty
   * string.
   */
  const getFileName = (name: string) => {
    const lastDot = name.lastIndexOf('.')
    // lastDot <= 0 means "no dot" or "leading dot only": there is no extension to strip.
    return lastDot > 0 ? name.slice(0, lastDot) : name
  }
  // Translated label for a pre-processing rule id; unknown ids yield undefined.
  const getRuleName = (key: string) => {
    switch (key) {
      case 'remove_extra_spaces':
        return t('datasetCreation.stepTwo.removeExtraSpaces')
      case 'remove_urls_emails':
        return t('datasetCreation.stepTwo.removeUrlEmails')
      case 'remove_stopwords':
        return t('datasetCreation.stepTwo.removeStopwords')
    }
  }
  // Toggles the `enabled` flag of the rule with the given id and stores the new list.
  const ruleChangeHandle = (id: string) => {
    setRules(rules.map(rule => (
      rule.id === id
        ? { id: rule.id, enabled: !rule.enabled }
        : rule
    )))
  }
  // Restores separator, chunk length, overlap and rules from the stored default config, if any.
  const resetRules = () => {
    if (!defaultConfig)
      return
    const { segmentation, pre_processing_rules } = defaultConfig
    setSegmentIdentifier(segmentation.separator)
    setMax(segmentation.max_tokens)
    setOverlap(segmentation.chunk_overlap)
    setRules(pre_processing_rules)
  }
  // Requests an indexing estimate and stores it in the slot of the current segmentation mode.
  const fetchFileIndexingEstimate = async (docForm = DocForm.TEXT) => {
    // eslint-disable-next-line @typescript-eslint/no-use-before-define
    const estimateParams = getFileIndexingEstimateParams(docForm)!
    const estimate = await didFetchFileIndexingEstimate(estimateParams)
    const storeEstimate = segmentationType === SegmentType.CUSTOM
      ? setCustomFileIndexingEstimate
      : setAutomaticFileIndexingEstimate
    storeEstimate(estimate)
  }
  // "Preview" button handler for custom mode: clears the old custom estimate,
  // opens the preview and requests a fresh estimate.
  const confirmChangeCustomConfig = () => {
    setCustomFileIndexingEstimate(null)
    setPreviewSwitched(false)
    setShowPreview()
    fetchFileIndexingEstimate()
  }
  // The indexing type given via props wins over the locally selected one.
  const getIndexing_technique = () => {
    return indexingType || indexType
  }
  // Builds the process rule for the current segmentation mode.
  // Only custom mode carries concrete rules; otherwise `rules` is an empty object.
  const getProcessRule = () => {
    const isCustom = segmentationType === SegmentType.CUSTOM
    const processRule: ProcessRule = {
      rules: isCustom
        ? {
          pre_processing_rules: rules,
          segmentation: {
            separator: unescape(segmentIdentifier),
            max_tokens: max,
            chunk_overlap: overlap,
          },
        }
        : ({} as any), // api will check this. It will be removed after api refactored.
      mode: segmentationType,
    }
    return processRule
  }
  // Groups the selected Notion pages by workspace and keeps only the fields sent to the API.
  const getNotionInfo = () => {
    const pagesByWorkspace = groupBy(notionPages, 'workspace_id')
    return Object.entries(pagesByWorkspace).map(([workspace_id, pages]) => ({
      workspace_id,
      pages: pages.map(({ page_id, page_name, page_icon, type }) => ({
        page_id,
        page_name,
        page_icon,
        type,
      })),
    })) as NotionInfo[]
  }
  // Describes the crawled website source: provider, crawl job and the selected page URLs.
  const getWebsiteInfo = () => ({
    provider: websiteCrawlProvider,
    job_id: websiteCrawlJobId,
    urls: websitePages.map(({ source_url }) => source_url),
    only_main_content: crawlOptions?.only_main_content,
  })
  // Builds the estimate request for the active data source type.
  // Returns undefined when the data source type is none of FILE / NOTION / WEB.
  const getFileIndexingEstimateParams = (docForm: DocForm): IndexingEstimateParams | undefined => {
    // Fields shared by every data source. Kept as a function so that nothing is
    // evaluated for an unknown source type and `info_list` is still built first.
    const sharedFields = () => ({
      indexing_technique: getIndexing_technique() as string,
      process_rule: getProcessRule(),
      doc_form: docForm,
      doc_language: docLanguage,
      dataset_id: datasetId as string,
    })
    switch (dataSourceType) {
      case DataSourceType.FILE:
        return {
          info_list: {
            data_source_type: dataSourceType,
            file_info_list: {
              file_ids: files.map(file => file.id) as string[],
            },
          },
          ...sharedFields(),
        }
      case DataSourceType.NOTION:
        return {
          info_list: {
            data_source_type: dataSourceType,
            notion_info_list: getNotionInfo(),
          },
          ...sharedFields(),
        }
      case DataSourceType.WEB:
        return {
          info_list: {
            data_source_type: dataSourceType,
            website_info_list: getWebsiteInfo(),
          },
          ...sharedFields(),
        }
      default:
        return undefined
    }
  }
  296. const {
  297. modelList: rerankModelList,
  298. defaultModel: rerankDefaultModel,
  299. currentModel: isRerankDefaultModelValid,
  300. } = useModelListAndDefaultModelAndCurrentProviderAndModel(ModelTypeEnum.rerank)
  301. const { data: embeddingModelList } = useModelList(ModelTypeEnum.textEmbedding)
  302. const { data: defaultEmbeddingModel } = useDefaultModel(ModelTypeEnum.textEmbedding)
  303. const [embeddingModel, setEmbeddingModel] = useState<DefaultModel>(
  304. currentDataset?.embedding_model
  305. ? {
  306. provider: currentDataset.embedding_model_provider,
  307. model: currentDataset.embedding_model,
  308. }
  309. : {
  310. provider: defaultEmbeddingModel?.provider.provider || '',
  311. model: defaultEmbeddingModel?.model || '',
  312. },
  313. )
  // Builds the request body for creating a document or re-processing an existing one.
  // Returns undefined (after showing an error toast) when validation fails:
  //   - custom segmentation with chunk overlap larger than the max chunk length
  //   - create mode and the rerank model check does not pass
  const getCreationParams = () => {
    let params
    if (segmentationType === SegmentType.CUSTOM && overlap > max) {
      Toast.notify({ type: 'error', message: t('datasetCreation.stepTwo.overlapCheck') })
      return
    }
    if (isSetting) {
      // Setting mode: re-process an existing document; no data source info is sent.
      params = {
        original_document_id: documentDetail?.id,
        doc_form: docForm,
        doc_language: docLanguage,
        process_rule: getProcessRule(),
        // eslint-disable-next-line @typescript-eslint/no-use-before-define
        retrieval_model: retrievalConfig, // Readonly. To change it, go to the settings page.
        embedding_model: embeddingModel.model, // Readonly
        embedding_model_provider: embeddingModel.provider, // Readonly
      } as CreateDocumentReq
    }
    else { // create
      const indexMethod = getIndexing_technique()
      if (
        !isReRankModelSelected({
          rerankDefaultModel,
          isRerankDefaultModelValid: !!isRerankDefaultModelValid,
          rerankModelList,
          // eslint-disable-next-line @typescript-eslint/no-use-before-define
          retrievalConfig,
          indexMethod: indexMethod as string,
        })
      ) {
        Toast.notify({ type: 'error', message: t('appDebug.datasetConfig.rerankModelRequired') })
        return
      }
      // The retrieval config that is actually sent is the one returned by this helper.
      const postRetrievalConfig = ensureRerankModelSelected({
        rerankDefaultModel: rerankDefaultModel!,
        // eslint-disable-next-line @typescript-eslint/no-use-before-define
        retrievalConfig,
        indexMethod: indexMethod as string,
      })
      params = {
        data_source: {
          type: dataSourceType,
          info_list: {
            data_source_type: dataSourceType,
          },
        },
        indexing_technique: getIndexing_technique(),
        process_rule: getProcessRule(),
        doc_form: docForm,
        doc_language: docLanguage,
        retrieval_model: postRetrievalConfig,
        embedding_model: embeddingModel.model,
        embedding_model_provider: embeddingModel.provider,
      } as CreateDocumentReq
      // Attach the source-specific list that matches the data source type.
      if (dataSourceType === DataSourceType.FILE) {
        params.data_source.info_list.file_info_list = {
          // Files without an id are dropped.
          file_ids: files.map(file => file.id || '').filter(Boolean),
        }
      }
      if (dataSourceType === DataSourceType.NOTION)
        params.data_source.info_list.notion_info_list = getNotionInfo()
      if (dataSourceType === DataSourceType.WEB)
        params.data_source.info_list.website_info_list = getWebsiteInfo()
    }
    return params
  }
  // Loads the default process rule from the API and seeds the form state with it.
  // Failures are only logged; the form then keeps its initial values.
  const getRules = async () => {
    try {
      const { rules: defaultRules } = await fetchDefaultProcessRule({ url: '/datasets/process-rule' })
      const { separator, max_tokens, chunk_overlap } = defaultRules.segmentation
      setSegmentIdentifier(separator)
      setMax(max_tokens)
      setOverlap(chunk_overlap)
      setRules(defaultRules.pre_processing_rules)
      setDefaultConfig(defaultRules)
    }
    catch (err) {
      console.log(err)
    }
  }
  // Seeds the form state from the process rule stored on the existing document.
  const getRulesFromDetail = () => {
    if (!documentDetail)
      return
    const detailRules = documentDetail.dataset_process_rule.rules
    const { separator, max_tokens, chunk_overlap } = detailRules.segmentation
    setSegmentIdentifier(separator)
    setMax(max_tokens)
    setOverlap(chunk_overlap)
    setRules(detailRules.pre_processing_rules)
    setDefaultConfig(detailRules)
  }
  // Adopts the segmentation mode stored on the existing document, if there is one.
  const getDefaultMode = () => {
    if (!documentDetail)
      return
    setSegmentationType(documentDetail.dataset_process_rule.mode)
  }
  /**
   * Submits the form: creates the first document of a new dataset when there is
   * no `datasetId`, otherwise creates a document in the existing dataset.
   *
   * Does nothing while a previous submit is still running. Resolves to `false`
   * when the parameters fail validation (a toast has already been shown) and to
   * `undefined` otherwise. Request errors are shown as an error toast.
   */
  const createHandle = async () => {
    if (isCreating)
      return
    setIsCreating(true)
    try {
      const params = getCreationParams()
      if (!params)
        return false
      const body = params as CreateDocumentReq
      const res = datasetId
        ? await createDocument({ datasetId, body })
        : await createFirstDocument({ body })
      // Both create paths report the result the same way.
      updateIndexingTypeCache?.(indexType as string)
      updateResultCache?.(res)
      mutateDatasetRes?.()
      onStepChange?.(+1)
      if (isSetting)
        onSave?.()
    }
    catch (err) {
      Toast.notify({
        type: 'error',
        message: `${err}`,
      })
    }
    finally {
      // Runs for the early `return false` as well, so the button is never left locked.
      setIsCreating(false)
    }
  }
  // Q&A switch handler: on selects the QA doc form, off selects plain text.
  const handleSwitch = (state: boolean) => {
    setDocForm(state ? DocForm.QA : DocForm.TEXT)
  }
  // Language picker handler: stores the chosen document language.
  const handleSelect = (language: string) => setDocLanguage(language)
  // Selects the economical index type and resets the doc form to plain text.
  // Ignored when the indexing type was fixed through props.
  const changeToEconomicalType = () => {
    if (hasSetIndexType)
      return
    setIndexType(IndexingType.ECONOMICAL)
    setDocForm(DocForm.TEXT)
  }
  // Switches the preview to Q&A: clears the estimate of the current mode and requests a QA estimate.
  const previewSwitch = async () => {
    setPreviewSwitched(true)
    const clearEstimate = segmentationType === SegmentType.AUTO
      ? setAutomaticFileIndexingEstimate
      : setCustomFileIndexingEstimate
    clearEstimate(null)
    await fetchFileIndexingEstimate(DocForm.QA)
  }
  // On mount: take rules from the existing document in setting mode, otherwise fetch the defaults.
  useEffect(() => {
    if (isSetting) {
      getRulesFromDetail()
      getDefaultMode()
    }
    else {
      // fetch rules
      getRules()
    }
  }, [])
  // Attaches the scroll listener of the form pane once on mount.
  useEffect(() => {
    // Capture the node now: by cleanup time `scrollRef.current` may already be
    // null or point at another element, and the listener would never be removed.
    const scrollNode = scrollRef.current
    if (!scrollNode)
      return
    scrollNode.addEventListener('scroll', scrollHandle)
    return () => {
      scrollNode.removeEventListener('scroll', scrollHandle)
    }
  }, [])
  // Attaches the scroll listener of the preview pane while the preview is shown.
  useLayoutEffect(() => {
    if (!showPreview)
      return
    // Capture the node now: the ref may be cleared before the cleanup runs,
    // so reading `previewScrollRef.current` there could skip the removal.
    const previewNode = previewScrollRef.current
    if (!previewNode)
      return
    previewNode.addEventListener('scroll', previewScrollHandle)
    return () => {
      previewNode.removeEventListener('scroll', previewScrollHandle)
    }
  }, [showPreview])
  // Falls back to the plain text doc form when the preset indexing type is economical and QA is selected.
  useEffect(() => {
    const isQAWithEconomical = indexingType === IndexingType.ECONOMICAL && docForm === DocForm.QA
    if (isQAWithEconomical)
      setDocForm(DocForm.TEXT)
  }, [indexingType, docForm])
  // Keeps the local index type in sync with props: a preset indexing type wins,
  // otherwise the choice depends on whether an API key is set.
  useEffect(() => {
    if (indexingType) {
      // get indexing type by props
      setIndexType(indexingType as IndexingType)
      return
    }
    const fallbackType = isAPIKeySet ? IndexingType.QUALIFIED : IndexingType.ECONOMICAL
    setIndexType(fallbackType)
  }, [isAPIKeySet, indexingType, datasetId])
  // Reacts to a change of segmentation mode or index type:
  // custom mode hides the preview and drops its estimate; automatic mode
  // drops its estimate, opens the preview (except on mobile) and refetches.
  useEffect(() => {
    if (segmentationType !== SegmentType.AUTO) {
      hidePreview()
      setCustomFileIndexingEstimate(null)
      setPreviewSwitched(false)
      return
    }
    setAutomaticFileIndexingEstimate(null)
    if (!isMobile)
      setShowPreview()
    fetchFileIndexingEstimate()
    setPreviewSwitched(false)
  }, [segmentationType, indexType])
  522. const [retrievalConfig, setRetrievalConfig] = useState(currentDataset?.retrieval_model_dict || {
  523. search_method: RETRIEVE_METHOD.semantic,
  524. reranking_enable: false,
  525. reranking_model: {
  526. reranking_provider_name: rerankDefaultModel?.provider.provider,
  527. reranking_model_name: rerankDefaultModel?.model,
  528. },
  529. top_k: 3,
  530. score_threshold_enabled: false,
  531. score_threshold: 0.5,
  532. } as RetrievalConfig)
  533. return (
  534. <div className='flex w-full h-full'>
  535. <div ref={scrollRef} className='relative h-full w-full overflow-y-scroll'>
  536. <div className={cn(s.pageHeader, scrolled && s.fixed, isMobile && '!px-6')}>
  537. <span>{t('datasetCreation.steps.two')}</span>
  538. {isMobile && (
  539. <Button
  540. className='border-[0.5px] !h-8 hover:outline hover:outline-[0.5px] hover:outline-gray-300 text-gray-700 font-medium bg-white shadow-[0px_1px_2px_0px_rgba(16,24,40,0.05)]'
  541. onClick={setShowPreview}
  542. >
  543. <Tooltip>
  544. <div className="flex flex-row items-center">
  545. <RocketLaunchIcon className="h-4 w-4 mr-1.5 stroke-[1.8px]" />
  546. <span className="text-[13px]">{t('datasetCreation.stepTwo.previewTitleButton')}</span>
  547. </div>
  548. </Tooltip>
  549. </Button>
  550. )}
  551. </div>
  552. <div className={cn(s.form, isMobile && '!px-4')}>
  553. <div className={s.label}>{t('datasetCreation.stepTwo.segmentation')}</div>
  554. <div className='max-w-[640px]'>
  555. <div
  556. className={cn(
  557. s.radioItem,
  558. s.segmentationItem,
  559. segmentationType === SegmentType.AUTO && s.active,
  560. )}
  561. onClick={() => setSegmentationType(SegmentType.AUTO)}
  562. >
  563. <span className={cn(s.typeIcon, s.auto)} />
  564. <span className={cn(s.radio)} />
  565. <div className={s.typeHeader}>
  566. <div className={s.title}>{t('datasetCreation.stepTwo.auto')}</div>
  567. <div className={s.tip}>{t('datasetCreation.stepTwo.autoDescription')}</div>
  568. </div>
  569. </div>
  570. <div
  571. className={cn(
  572. s.radioItem,
  573. s.segmentationItem,
  574. segmentationType === SegmentType.CUSTOM && s.active,
  575. segmentationType === SegmentType.CUSTOM && s.custom,
  576. )}
  577. onClick={() => setSegmentationType(SegmentType.CUSTOM)}
  578. >
  579. <span className={cn(s.typeIcon, s.customize)} />
  580. <span className={cn(s.radio)} />
  581. <div className={s.typeHeader}>
  582. <div className={s.title}>{t('datasetCreation.stepTwo.custom')}</div>
  583. <div className={s.tip}>{t('datasetCreation.stepTwo.customDescription')}</div>
  584. </div>
  585. {segmentationType === SegmentType.CUSTOM && (
  586. <div className={s.typeFormBody}>
  587. <div className={s.formRow}>
  588. <div className='w-full'>
  589. <div className={s.label}>
  590. {t('datasetCreation.stepTwo.separator')}
  591. <Tooltip
  592. popupContent={
  593. <div className='max-w-[200px]'>
  594. {t('datasetCreation.stepTwo.separatorTip')}
  595. </div>
  596. }
  597. />
  598. </div>
  599. <input
  600. type="text"
  601. className={s.input}
  602. placeholder={t('datasetCreation.stepTwo.separatorPlaceholder') || ''}
  603. value={segmentIdentifier}
  604. onChange={e => doSetSegmentIdentifier(e.target.value)}
  605. />
  606. </div>
  607. </div>
  608. <div className={s.formRow}>
  609. <div className='w-full'>
  610. <div className={s.label}>{t('datasetCreation.stepTwo.maxLength')}</div>
  611. <div className='relative w-full'>
  612. <input
  613. type="number"
  614. className={s.input}
  615. placeholder={t('datasetCreation.stepTwo.maxLength') || ''}
  616. value={max}
  617. min={1}
  618. onChange={e => setMax(parseInt(e.target.value.replace(/^0+/, ''), 10))}
  619. />
  620. <div className='absolute top-2.5 right-2.5 text-text-tertiary system-sm-regular'>Tokens</div>
  621. </div>
  622. </div>
  623. </div>
  624. <div className={s.formRow}>
  625. <div className='w-full'>
  626. <div className={s.label}>
  627. {t('datasetCreation.stepTwo.overlap')}
  628. <Tooltip
  629. popupContent={
  630. <div className='max-w-[200px]'>
  631. {t('datasetCreation.stepTwo.overlapTip')}
  632. </div>
  633. }
  634. />
  635. </div>
  636. <div className='relative w-full'>
  637. <input
  638. type="number"
  639. className={s.input}
  640. placeholder={t('datasetCreation.stepTwo.overlap') || ''}
  641. value={overlap}
  642. min={1}
  643. onChange={e => setOverlap(parseInt(e.target.value.replace(/^0+/, ''), 10))}
  644. />
  645. <div className='absolute top-2.5 right-2.5 text-text-tertiary system-sm-regular'>Tokens</div>
  646. </div>
  647. </div>
  648. </div>
  649. <div className={s.formRow}>
  650. <div className='w-full flex flex-col gap-1'>
  651. <div className={s.label}>{t('datasetCreation.stepTwo.rules')}</div>
  652. {rules.map(rule => (
  653. <div key={rule.id} className={s.ruleItem}>
  654. <input id={rule.id} type="checkbox" checked={rule.enabled} onChange={() => ruleChangeHandle(rule.id)} className="w-4 h-4 rounded border-gray-300 text-blue-700 focus:ring-blue-700" />
  655. <label htmlFor={rule.id} className="ml-2 text-sm font-normal cursor-pointer text-gray-800">{getRuleName(rule.id)}</label>
  656. </div>
  657. ))}
  658. </div>
  659. </div>
  660. <div className={s.formFooter}>
  661. <Button variant="primary" className={cn(s.button)} onClick={confirmChangeCustomConfig}>{t('datasetCreation.stepTwo.preview')}</Button>
  662. <Button className={cn(s.button, 'ml-2')} onClick={resetRules}>{t('datasetCreation.stepTwo.reset')}</Button>
  663. </div>
  664. </div>
  665. )}
  666. </div>
  667. </div>
  668. <div className={s.label}>{t('datasetCreation.stepTwo.indexMode')}</div>
  669. <div className='max-w-[640px]'>
  670. <div className='flex items-center gap-3 flex-wrap sm:flex-nowrap'>
  671. {(!hasSetIndexType || (hasSetIndexType && indexingType === IndexingType.QUALIFIED)) && (
  672. <div
  673. className={cn(
  674. s.radioItem,
  675. s.indexItem,
  676. !isAPIKeySet && s.disabled,
  677. !hasSetIndexType && indexType === IndexingType.QUALIFIED && s.active,
  678. hasSetIndexType && s.disabled,
  679. hasSetIndexType && '!w-full !min-h-[96px]',
  680. )}
  681. onClick={() => {
  682. if (isAPIKeySet)
  683. setIndexType(IndexingType.QUALIFIED)
  684. }}
  685. >
  686. <span className={cn(s.typeIcon, s.qualified)} />
  687. {!hasSetIndexType && <span className={cn(s.radio)} />}
  688. <div className={s.typeHeader}>
  689. <div className={s.title}>
  690. {t('datasetCreation.stepTwo.qualified')}
  691. {!hasSetIndexType && <span className={s.recommendTag}>{t('datasetCreation.stepTwo.recommend')}</span>}
  692. </div>
  693. <div className={s.tip}>{t('datasetCreation.stepTwo.qualifiedTip')}</div>
  694. </div>
  695. {!isAPIKeySet && (
  696. <div className={s.warningTip}>
  697. <span>{t('datasetCreation.stepTwo.warning')}&nbsp;</span>
  698. <span className={s.click} onClick={onSetting}>{t('datasetCreation.stepTwo.click')}</span>
  699. </div>
  700. )}
  701. </div>
  702. )}
  703. {(!hasSetIndexType || (hasSetIndexType && indexingType === IndexingType.ECONOMICAL)) && (
  704. <div
  705. className={cn(
  706. s.radioItem,
  707. s.indexItem,
  708. !hasSetIndexType && indexType === IndexingType.ECONOMICAL && s.active,
  709. hasSetIndexType && s.disabled,
  710. hasSetIndexType && '!w-full !min-h-[96px]',
  711. )}
  712. onClick={changeToEconomicalType}
  713. >
  714. <span className={cn(s.typeIcon, s.economical)} />
  715. {!hasSetIndexType && <span className={cn(s.radio)} />}
  716. <div className={s.typeHeader}>
  717. <div className={s.title}>{t('datasetCreation.stepTwo.economical')}</div>
  718. <div className={s.tip}>{t('datasetCreation.stepTwo.economicalTip')}</div>
  719. </div>
  720. </div>
  721. )}
  722. </div>
  723. {hasSetIndexType && indexType === IndexingType.ECONOMICAL && (
  724. <div className='mt-2 text-xs text-gray-500 font-medium'>
  725. {t('datasetCreation.stepTwo.indexSettingTip')}
  726. <Link className='text-[#155EEF]' href={`/datasets/${datasetId}/settings`}>{t('datasetCreation.stepTwo.datasetSettingLink')}</Link>
  727. </div>
  728. )}
  729. {IS_CE_EDITION && indexType === IndexingType.QUALIFIED && (
  730. <div className='mt-3 rounded-xl bg-gray-50 border border-gray-100'>
  731. <div className='flex justify-between items-center px-5 py-4'>
  732. <div className='flex justify-center items-center w-8 h-8 rounded-lg bg-indigo-50'>
  733. <MessageChatSquare className='w-4 h-4' />
  734. </div>
  735. <div className='grow mx-3'>
  736. <div className='mb-[2px] text-md font-medium text-gray-900'>{t('datasetCreation.stepTwo.QATitle')}</div>
  737. <div className='inline-flex items-center text-[13px] leading-[18px] text-gray-500'>
  738. <span className='pr-1'>{t('datasetCreation.stepTwo.QALanguage')}</span>
  739. <LanguageSelect currentLanguage={docLanguage} onSelect={handleSelect} />
  740. </div>
  741. </div>
  742. <div className='shrink-0'>
  743. <Switch
  744. defaultValue={docForm === DocForm.QA}
  745. onChange={handleSwitch}
  746. size='md'
  747. />
  748. </div>
  749. </div>
  750. {docForm === DocForm.QA && !QATipHide && (
  751. <div className='flex justify-between items-center px-5 py-2 bg-orange-50 border-t border-amber-100 rounded-b-xl text-[13px] leading-[18px] text-medium text-amber-500'>
  752. {t('datasetCreation.stepTwo.QATip')}
  753. <RiCloseLine className='w-4 h-4 text-gray-500 cursor-pointer' onClick={() => setQATipHide(true)} />
  754. </div>
  755. )}
  756. </div>
  757. )}
  758. {/* Embedding model */}
  759. {indexType === IndexingType.QUALIFIED && (
  760. <div className='mb-2'>
  761. <div className={cn(s.label, datasetId && 'flex justify-between items-center')}>{t('datasetSettings.form.embeddingModel')}</div>
  762. <ModelSelector
  763. readonly={!!datasetId}
  764. defaultModel={embeddingModel}
  765. modelList={embeddingModelList}
  766. onSelect={(model: DefaultModel) => {
  767. setEmbeddingModel(model)
  768. }}
  769. />
  770. {!!datasetId && (
  771. <div className='mt-2 text-xs text-gray-500 font-medium'>
  772. {t('datasetCreation.stepTwo.indexSettingTip')}
  773. <Link className='text-[#155EEF]' href={`/datasets/${datasetId}/settings`}>{t('datasetCreation.stepTwo.datasetSettingLink')}</Link>
  774. </div>
  775. )}
  776. </div>
  777. )}
  778. {/* Retrieval Method Config */}
  779. <div>
  780. {!datasetId
  781. ? (
  782. <div className={s.label}>
  783. <div className='shrink-0 mr-4'>{t('datasetSettings.form.retrievalSetting.title')}</div>
  784. <div className='leading-[18px] text-xs font-normal text-gray-500'>
  785. <a target='_blank' rel='noopener noreferrer' href='https://docs.dify.ai/guides/knowledge-base/create-knowledge-and-upload-documents#id-4-retrieval-settings' className='text-[#155eef]'>{t('datasetSettings.form.retrievalSetting.learnMore')}</a>
  786. {t('datasetSettings.form.retrievalSetting.longDescription')}
  787. </div>
  788. </div>
  789. )
  790. : (
  791. <div className={cn(s.label, 'flex justify-between items-center')}>
  792. <div>{t('datasetSettings.form.retrievalSetting.title')}</div>
  793. </div>
  794. )}
  795. <div className='max-w-[640px]'>
  796. {
  797. getIndexing_technique() === IndexingType.QUALIFIED
  798. ? (
  799. <RetrievalMethodConfig
  800. value={retrievalConfig}
  801. onChange={setRetrievalConfig}
  802. />
  803. )
  804. : (
  805. <EconomicalRetrievalMethodConfig
  806. value={retrievalConfig}
  807. onChange={setRetrievalConfig}
  808. />
  809. )
  810. }
  811. </div>
  812. </div>
  813. <div className={s.source}>
  814. <div className={s.sourceContent}>
  815. {dataSourceType === DataSourceType.FILE && (
  816. <>
  817. <div className='mb-2 text-xs font-medium text-gray-500'>{t('datasetCreation.stepTwo.fileSource')}</div>
  818. <div className='flex items-center text-sm leading-6 font-medium text-gray-800'>
  819. <span className={cn(s.fileIcon, files.length && s[files[0].extension || ''])} />
  820. {getFileName(files[0].name || '')}
  821. {files.length > 1 && (
  822. <span className={s.sourceCount}>
  823. <span>{t('datasetCreation.stepTwo.other')}</span>
  824. <span>{files.length - 1}</span>
  825. <span>{t('datasetCreation.stepTwo.fileUnit')}</span>
  826. </span>
  827. )}
  828. </div>
  829. </>
  830. )}
  831. {dataSourceType === DataSourceType.NOTION && (
  832. <>
  833. <div className='mb-2 text-xs font-medium text-gray-500'>{t('datasetCreation.stepTwo.notionSource')}</div>
  834. <div className='flex items-center text-sm leading-6 font-medium text-gray-800'>
  835. <NotionIcon
  836. className='shrink-0 mr-1'
  837. type='page'
  838. src={notionPages[0]?.page_icon}
  839. />
  840. {notionPages[0]?.page_name}
  841. {notionPages.length > 1 && (
  842. <span className={s.sourceCount}>
  843. <span>{t('datasetCreation.stepTwo.other')}</span>
  844. <span>{notionPages.length - 1}</span>
  845. <span>{t('datasetCreation.stepTwo.notionUnit')}</span>
  846. </span>
  847. )}
  848. </div>
  849. </>
  850. )}
  851. {dataSourceType === DataSourceType.WEB && (
  852. <>
  853. <div className='mb-2 text-xs font-medium text-gray-500'>{t('datasetCreation.stepTwo.websiteSource')}</div>
  854. <div className='flex items-center text-sm leading-6 font-medium text-gray-800'>
  855. <Globe01 className='shrink-0 mr-1' />
  856. <span className='grow w-0 truncate'>{websitePages[0].source_url}</span>
  857. {websitePages.length > 1 && (
  858. <span className={s.sourceCount}>
  859. <span>{t('datasetCreation.stepTwo.other')}</span>
  860. <span>{websitePages.length - 1}</span>
  861. <span>{t('datasetCreation.stepTwo.webpageUnit')}</span>
  862. </span>
  863. )}
  864. </div>
  865. </>
  866. )}
  867. </div>
  868. <div className={s.divider} />
  869. <div className={s.segmentCount}>
  870. <div className='mb-2 text-xs font-medium text-gray-500'>{t('datasetCreation.stepTwo.estimateSegment')}</div>
  871. <div className='flex items-center text-sm leading-6 font-medium text-gray-800'>
  872. {
  873. fileIndexingEstimate
  874. ? (
  875. <div className='text-xs font-medium text-gray-800'>{formatNumber(fileIndexingEstimate.total_segments)} </div>
  876. )
  877. : (
  878. <div className={s.calculating}>{t('datasetCreation.stepTwo.calculating')}</div>
  879. )
  880. }
  881. </div>
  882. </div>
  883. </div>
  884. {!isSetting
  885. ? (
  886. <div className='flex items-center mt-8 py-2'>
  887. <Button onClick={() => onStepChange && onStepChange(-1)}>{t('datasetCreation.stepTwo.previousStep')}</Button>
  888. <div className={s.divider} />
  889. <Button loading={isCreating} variant='primary' onClick={createHandle}>{t('datasetCreation.stepTwo.nextStep')}</Button>
  890. </div>
  891. )
  892. : (
  893. <div className='flex items-center mt-8 py-2'>
  894. <Button loading={isCreating} variant='primary' onClick={createHandle}>{t('datasetCreation.stepTwo.save')}</Button>
  895. <Button className='ml-2' onClick={onCancel}>{t('datasetCreation.stepTwo.cancel')}</Button>
  896. </div>
  897. )}
  898. </div>
  899. </div>
  900. </div>
  901. <FloatRightContainer isMobile={isMobile} isOpen={showPreview} onClose={hidePreview} footer={null}>
  902. {showPreview && <div ref={previewScrollRef} className={cn(s.previewWrap, isMobile && s.isMobile, 'relative h-full overflow-y-scroll border-l border-[#F2F4F7]')}>
  903. <div className={cn(s.previewHeader, previewScrolled && `${s.fixed} pb-3`)}>
  904. <div className='flex items-center justify-between px-8'>
  905. <div className='grow flex items-center'>
  906. <div>{t('datasetCreation.stepTwo.previewTitle')}</div>
  907. {docForm === DocForm.QA && !previewSwitched && (
  908. <Button className='ml-2' variant='secondary-accent' onClick={previewSwitch}>{t('datasetCreation.stepTwo.previewButton')}</Button>
  909. )}
  910. </div>
  911. <div className='flex items-center justify-center w-6 h-6 cursor-pointer' onClick={hidePreview}>
  912. <XMarkIcon className='h-4 w-4'></XMarkIcon>
  913. </div>
  914. </div>
  915. {docForm === DocForm.QA && !previewSwitched && (
  916. <div className='px-8 pr-12 text-xs text-gray-500'>
  917. <span>{t('datasetCreation.stepTwo.previewSwitchTipStart')}</span>
  918. <span className='text-amber-600'>{t('datasetCreation.stepTwo.previewSwitchTipEnd')}</span>
  919. </div>
  920. )}
  921. </div>
  922. <div className='my-4 px-8 space-y-4'>
  923. {previewSwitched && docForm === DocForm.QA && fileIndexingEstimate?.qa_preview && (
  924. <>
  925. {fileIndexingEstimate?.qa_preview.map((item, index) => (
  926. <PreviewItem type={PreviewType.QA} key={item.question} qa={item} index={index + 1} />
  927. ))}
  928. </>
  929. )}
  930. {(docForm === DocForm.TEXT || !previewSwitched) && fileIndexingEstimate?.preview && (
  931. <>
  932. {fileIndexingEstimate?.preview.map((item, index) => (
  933. <PreviewItem type={PreviewType.TEXT} key={item} content={item} index={index + 1} />
  934. ))}
  935. </>
  936. )}
  937. {previewSwitched && docForm === DocForm.QA && !fileIndexingEstimate?.qa_preview && (
  938. <div className='flex items-center justify-center h-[200px]'>
  939. <Loading type='area' />
  940. </div>
  941. )}
  942. {!previewSwitched && !fileIndexingEstimate?.preview && (
  943. <div className='flex items-center justify-center h-[200px]'>
  944. <Loading type='area' />
  945. </div>
  946. )}
  947. </div>
  948. </div>}
  949. {!showPreview && (
  950. <div className={cn(s.sideTip)}>
  951. <div className={s.tipCard}>
  952. <span className={s.icon} />
  953. <div className={s.title}>{t('datasetCreation.stepTwo.sideTipTitle')}</div>
  954. <div className={s.content}>
  955. <p className='mb-3'>{t('datasetCreation.stepTwo.sideTipP1')}</p>
  956. <p className='mb-3'>{t('datasetCreation.stepTwo.sideTipP2')}</p>
  957. <p className='mb-3'>{t('datasetCreation.stepTwo.sideTipP3')}</p>
  958. <p>{t('datasetCreation.stepTwo.sideTipP4')}</p>
  959. </div>
  960. </div>
  961. </div>
  962. )}
  963. </FloatRightContainer>
  964. </div>
  965. )
  966. }
  // Expose StepTwo as this module's default export.
  967. export default StepTwo