|  | @@ -1,5 +1,5 @@
 | 
											
												
													
														|  |  'use client'
 |  |  'use client'
 | 
											
												
													
														|  | -import React, { useEffect, useLayoutEffect, useRef, useState } from 'react'
 |  | 
 | 
											
												
													
														|  | 
 |  | +import React, { useCallback, useEffect, useLayoutEffect, useRef, useState } from 'react'
 | 
											
												
													
														|  |  import { useTranslation } from 'react-i18next'
 |  |  import { useTranslation } from 'react-i18next'
 | 
											
												
													
														|  |  import { useContext } from 'use-context-selector'
 |  |  import { useContext } from 'use-context-selector'
 | 
											
												
													
														|  |  import { useBoolean } from 'ahooks'
 |  |  import { useBoolean } from 'ahooks'
 | 
											
										
											
												
													
														|  | @@ -13,6 +13,8 @@ import { groupBy } from 'lodash-es'
 | 
											
												
													
														|  |  import PreviewItem, { PreviewType } from './preview-item'
 |  |  import PreviewItem, { PreviewType } from './preview-item'
 | 
											
												
													
														|  |  import LanguageSelect from './language-select'
 |  |  import LanguageSelect from './language-select'
 | 
											
												
													
														|  |  import s from './index.module.css'
 |  |  import s from './index.module.css'
 | 
											
												
													
														|  | 
 |  | +import unescape from './unescape'
 | 
											
												
													
														|  | 
 |  | +import escape from './escape'
 | 
											
												
													
														|  |  import cn from '@/utils/classnames'
 |  |  import cn from '@/utils/classnames'
 | 
											
												
													
														|  |  import type { CrawlOptions, CrawlResultItem, CreateDocumentReq, CustomFile, FileIndexingEstimateResponse, FullDocumentDetail, IndexingEstimateParams, NotionInfo, PreProcessingRule, ProcessRule, Rules, createDocumentResponse } from '@/models/datasets'
 |  |  import type { CrawlOptions, CrawlResultItem, CreateDocumentReq, CustomFile, FileIndexingEstimateResponse, FullDocumentDetail, IndexingEstimateParams, NotionInfo, PreProcessingRule, ProcessRule, Rules, createDocumentResponse } from '@/models/datasets'
 | 
											
												
													
														|  |  import {
 |  |  import {
 | 
											
										
											
												
													
														|  | @@ -78,6 +80,8 @@ enum IndexingType {
 | 
											
												
													
														|  |    ECONOMICAL = 'economy',
 |  |    ECONOMICAL = 'economy',
 | 
											
												
													
														|  |  }
 |  |  }
 | 
											
												
													
														|  |  
 |  |  
 | 
											
												
													
														|  | 
 |  | +const DEFAULT_SEGMENT_IDENTIFIER = '\\n\\n'
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  |  const StepTwo = ({
 |  |  const StepTwo = ({
 | 
											
												
													
														|  |    isSetting,
 |  |    isSetting,
 | 
											
												
													
														|  |    documentDetail,
 |  |    documentDetail,
 | 
											
										
											
												
													
														|  | @@ -110,8 +114,11 @@ const StepTwo = ({
 | 
											
												
													
														|  |    const previewScrollRef = useRef<HTMLDivElement>(null)
 |  |    const previewScrollRef = useRef<HTMLDivElement>(null)
 | 
											
												
													
														|  |    const [previewScrolled, setPreviewScrolled] = useState(false)
 |  |    const [previewScrolled, setPreviewScrolled] = useState(false)
 | 
											
												
													
														|  |    const [segmentationType, setSegmentationType] = useState<SegmentType>(SegmentType.AUTO)
 |  |    const [segmentationType, setSegmentationType] = useState<SegmentType>(SegmentType.AUTO)
 | 
											
												
													
														|  | -  const [segmentIdentifier, setSegmentIdentifier] = useState('\\n')
 |  | 
 | 
											
												
													
														|  | -  const [max, setMax] = useState(5000) // default chunk length
 |  | 
 | 
											
												
													
														|  | 
 |  | +  const [segmentIdentifier, doSetSegmentIdentifier] = useState(DEFAULT_SEGMENT_IDENTIFIER)
 | 
											
												
													
														|  | 
 |  | +  const setSegmentIdentifier = useCallback((value: string) => {
 | 
											
												
													
														|  | 
 |  | +    doSetSegmentIdentifier(value ? escape(value) : DEFAULT_SEGMENT_IDENTIFIER)
 | 
											
												
													
														|  | 
 |  | +  }, [])
 | 
											
												
													
														|  | 
 |  | +  const [max, setMax] = useState(4000) // default chunk length
 | 
											
												
													
														|  |    const [overlap, setOverlap] = useState(50)
 |  |    const [overlap, setOverlap] = useState(50)
 | 
											
												
													
														|  |    const [rules, setRules] = useState<PreProcessingRule[]>([])
 |  |    const [rules, setRules] = useState<PreProcessingRule[]>([])
 | 
											
												
													
														|  |    const [defaultConfig, setDefaultConfig] = useState<Rules>()
 |  |    const [defaultConfig, setDefaultConfig] = useState<Rules>()
 | 
											
										
											
												
													
														|  | @@ -183,7 +190,7 @@ const StepTwo = ({
 | 
											
												
													
														|  |    }
 |  |    }
 | 
											
												
													
														|  |    const resetRules = () => {
 |  |    const resetRules = () => {
 | 
											
												
													
														|  |      if (defaultConfig) {
 |  |      if (defaultConfig) {
 | 
											
												
													
														|  | -      setSegmentIdentifier((defaultConfig.segmentation.separator === '\n' ? '\\n' : defaultConfig.segmentation.separator) || '\\n')
 |  | 
 | 
											
												
													
														|  | 
 |  | +      setSegmentIdentifier(defaultConfig.segmentation.separator)
 | 
											
												
													
														|  |        setMax(defaultConfig.segmentation.max_tokens)
 |  |        setMax(defaultConfig.segmentation.max_tokens)
 | 
											
												
													
														|  |        setOverlap(defaultConfig.segmentation.chunk_overlap)
 |  |        setOverlap(defaultConfig.segmentation.chunk_overlap)
 | 
											
												
													
														|  |        setRules(defaultConfig.pre_processing_rules)
 |  |        setRules(defaultConfig.pre_processing_rules)
 | 
											
										
											
												
													
														|  | @@ -217,7 +224,7 @@ const StepTwo = ({
 | 
											
												
													
														|  |        const ruleObj = {
 |  |        const ruleObj = {
 | 
											
												
													
														|  |          pre_processing_rules: rules,
 |  |          pre_processing_rules: rules,
 | 
											
												
													
														|  |          segmentation: {
 |  |          segmentation: {
 | 
											
												
													
														|  | -          separator: segmentIdentifier === '\\n' ? '\n' : segmentIdentifier,
 |  | 
 | 
											
												
													
														|  | 
 |  | +          separator: unescape(segmentIdentifier),
 | 
											
												
													
														|  |            max_tokens: max,
 |  |            max_tokens: max,
 | 
											
												
													
														|  |            chunk_overlap: overlap,
 |  |            chunk_overlap: overlap,
 | 
											
												
													
														|  |          },
 |  |          },
 | 
											
										
											
												
													
														|  | @@ -394,7 +401,7 @@ const StepTwo = ({
 | 
											
												
													
														|  |      try {
 |  |      try {
 | 
											
												
													
														|  |        const res = await fetchDefaultProcessRule({ url: '/datasets/process-rule' })
 |  |        const res = await fetchDefaultProcessRule({ url: '/datasets/process-rule' })
 | 
											
												
													
														|  |        const separator = res.rules.segmentation.separator
 |  |        const separator = res.rules.segmentation.separator
 | 
											
												
													
														|  | -      setSegmentIdentifier((separator === '\n' ? '\\n' : separator) || '\\n')
 |  | 
 | 
											
												
													
														|  | 
 |  | +      setSegmentIdentifier(separator)
 | 
											
												
													
														|  |        setMax(res.rules.segmentation.max_tokens)
 |  |        setMax(res.rules.segmentation.max_tokens)
 | 
											
												
													
														|  |        setOverlap(res.rules.segmentation.chunk_overlap)
 |  |        setOverlap(res.rules.segmentation.chunk_overlap)
 | 
											
												
													
														|  |        setRules(res.rules.pre_processing_rules)
 |  |        setRules(res.rules.pre_processing_rules)
 | 
											
										
											
												
													
														|  | @@ -411,7 +418,7 @@ const StepTwo = ({
 | 
											
												
													
														|  |        const separator = rules.segmentation.separator
 |  |        const separator = rules.segmentation.separator
 | 
											
												
													
														|  |        const max = rules.segmentation.max_tokens
 |  |        const max = rules.segmentation.max_tokens
 | 
											
												
													
														|  |        const overlap = rules.segmentation.chunk_overlap
 |  |        const overlap = rules.segmentation.chunk_overlap
 | 
											
												
													
														|  | -      setSegmentIdentifier((separator === '\n' ? '\\n' : separator) || '\\n')
 |  | 
 | 
											
												
													
														|  | 
 |  | +      setSegmentIdentifier(separator)
 | 
											
												
													
														|  |        setMax(max)
 |  |        setMax(max)
 | 
											
												
													
														|  |        setOverlap(overlap)
 |  |        setOverlap(overlap)
 | 
											
												
													
														|  |        setRules(rules.pre_processing_rules)
 |  |        setRules(rules.pre_processing_rules)
 | 
											
										
											
												
													
														|  | @@ -616,12 +623,22 @@ const StepTwo = ({
 | 
											
												
													
														|  |                  <div className={s.typeFormBody}>
 |  |                  <div className={s.typeFormBody}>
 | 
											
												
													
														|  |                    <div className={s.formRow}>
 |  |                    <div className={s.formRow}>
 | 
											
												
													
														|  |                      <div className='w-full'>
 |  |                      <div className='w-full'>
 | 
											
												
													
														|  | -                      <div className={s.label}>{t('datasetCreation.stepTwo.separator')}</div>
 |  | 
 | 
											
												
													
														|  | 
 |  | +                      <div className={s.label}>
 | 
											
												
													
														|  | 
 |  | +                        {t('datasetCreation.stepTwo.separator')}
 | 
											
												
													
														|  | 
 |  | +                        <Tooltip
 | 
											
												
													
														|  | 
 |  | +                          popupContent={
 | 
											
												
													
														|  | 
 |  | +                            <div className='max-w-[200px]'>
 | 
											
												
													
														|  | 
 |  | +                              {t('datasetCreation.stepTwo.separatorTip')}
 | 
											
												
													
														|  | 
 |  | +                            </div>
 | 
											
												
													
														|  | 
 |  | +                          }
 | 
											
												
													
														|  | 
 |  | +                        />
 | 
											
												
													
														|  | 
 |  | +                      </div>
 | 
											
												
													
														|  |                        <input
 |  |                        <input
 | 
											
												
													
														|  |                          type="text"
 |  |                          type="text"
 | 
											
												
													
														|  |                          className={s.input}
 |  |                          className={s.input}
 | 
											
												
													
														|  | -                        placeholder={t('datasetCreation.stepTwo.separatorPlaceholder') || ''} value={segmentIdentifier}
 |  | 
 | 
											
												
													
														|  | -                        onChange={e => setSegmentIdentifier(e.target.value)}
 |  | 
 | 
											
												
													
														|  | 
 |  | +                        placeholder={t('datasetCreation.stepTwo.separatorPlaceholder') || ''}
 | 
											
												
													
														|  | 
 |  | +                        value={segmentIdentifier}
 | 
											
												
													
														|  | 
 |  | +                        onChange={e => doSetSegmentIdentifier(e.target.value)}
 | 
											
												
													
														|  |                        />
 |  |                        />
 | 
											
												
													
														|  |                      </div>
 |  |                      </div>
 | 
											
												
													
														|  |                    </div>
 |  |                    </div>
 |