use-metadata.ts 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391
  1. "use client";
  2. import { useTranslation } from "react-i18next";
  3. import dayjs from "dayjs";
  4. import { formatNumber, formatFileSize, formatTime } from '@/utils/format'
  5. import type { DocType } from '@/models/datasets'
  /** Allowed values for a sub-field's `inputType`. */
  export type inputType = 'input' | 'select' | 'textarea'

  /** Keys of the metadata map: any `DocType` plus `originInfo` and `technicalParameters`. */
  export type metadataType = DocType | 'originInfo' | 'technicalParameters'

  /** Description of a single sub-field inside one metadata section. */
  type MetadataSubField = {
    /** Text shown for the field. */
    label: string
    inputType?: inputType
    field?: string
    /** Optional formatter; receives the raw value and, optionally, a total. */
    render?: (value: any, total?: number) => React.ReactNode | string
  }

  /** Description of one metadata section (one entry of the map). */
  type MetadataSectionConfig = {
    text: string
    allowEdit?: boolean
    icon?: React.ReactNode
    iconName?: string
    /** Sub-fields of the section, keyed by field name. */
    subFieldsMap: { [fieldName: string]: MetadataSubField }
  }

  /** One section description per `metadataType`. */
  type MetadataMap = { [K in metadataType]: MetadataSectionConfig }
  const fieldPrefix = "datasetDocuments.metadata.field";

  /**
   * Hook that builds the metadata description map for dataset documents.
   *
   * Each entry carries a translated section title (`text`), optional
   * `iconName` / `allowEdit` flags, and a `subFieldsMap` whose values hold a
   * translated `label` plus an optional `inputType` or `render` formatter.
   * All labels are resolved through `t` from `useTranslation`, so a new object
   * is produced on every call.
   *
   * @returns The map with one entry per metadata type.
   */
  export const useMetadataMap = (): MetadataMap => {
    const { t } = useTranslation();

    // Shared by `created_at` and `completed_at`: formats a unix timestamp
    // with the translated date-time pattern.
    const renderUnixTime = (value: any) =>
      dayjs.unix(value).format(t('datasetDocuments.metadata.dateTimeFormat') as string);

    return {
      book: {
        text: t("datasetDocuments.metadata.type.book"),
        iconName: "bookOpen",
        subFieldsMap: {
          title: { label: t(`${fieldPrefix}.book.title`) },
          language: {
            label: t(`${fieldPrefix}.book.language`),
            inputType: "select",
          },
          author: { label: t(`${fieldPrefix}.book.author`) },
          publisher: { label: t(`${fieldPrefix}.book.publisher`) },
          publication_date: { label: t(`${fieldPrefix}.book.publicationDate`) },
          isbn: { label: t(`${fieldPrefix}.book.ISBN`) },
          category: {
            label: t(`${fieldPrefix}.book.category`),
            inputType: "select",
          },
        },
      },
      web_page: {
        text: t("datasetDocuments.metadata.type.webPage"),
        iconName: "globe",
        subFieldsMap: {
          title: { label: t(`${fieldPrefix}.webPage.title`) },
          url: { label: t(`${fieldPrefix}.webPage.url`) },
          language: {
            label: t(`${fieldPrefix}.webPage.language`),
            inputType: "select",
          },
          "author/publisher": { label: t(`${fieldPrefix}.webPage.authorPublisher`) },
          publish_date: { label: t(`${fieldPrefix}.webPage.publishDate`) },
          "topics/keywords": { label: t(`${fieldPrefix}.webPage.topicsKeywords`) },
          description: { label: t(`${fieldPrefix}.webPage.description`) },
        },
      },
      paper: {
        text: t("datasetDocuments.metadata.type.paper"),
        iconName: "graduationHat",
        subFieldsMap: {
          title: { label: t(`${fieldPrefix}.paper.title`) },
          language: {
            label: t(`${fieldPrefix}.paper.language`),
            inputType: "select",
          },
          author: { label: t(`${fieldPrefix}.paper.author`) },
          publish_date: { label: t(`${fieldPrefix}.paper.publishDate`) },
          "journal/conference_name": {
            label: t(`${fieldPrefix}.paper.journalConferenceName`),
          },
          "volume/issue/page_numbers": { label: t(`${fieldPrefix}.paper.volumeIssuePage`) },
          doi: { label: t(`${fieldPrefix}.paper.DOI`) },
          "topics/keywords": { label: t(`${fieldPrefix}.paper.topicsKeywords`) },
          abstract: {
            label: t(`${fieldPrefix}.paper.abstract`),
            inputType: "textarea",
          },
        },
      },
      social_media_post: {
        text: t("datasetDocuments.metadata.type.socialMediaPost"),
        iconName: "atSign",
        subFieldsMap: {
          platform: { label: t(`${fieldPrefix}.socialMediaPost.platform`) },
          "author/username": {
            label: t(`${fieldPrefix}.socialMediaPost.authorUsername`),
          },
          publish_date: { label: t(`${fieldPrefix}.socialMediaPost.publishDate`) },
          post_url: { label: t(`${fieldPrefix}.socialMediaPost.postURL`) },
          "topics/tags": { label: t(`${fieldPrefix}.socialMediaPost.topicsTags`) },
        },
      },
      personal_document: {
        text: t("datasetDocuments.metadata.type.personalDocument"),
        iconName: "file",
        subFieldsMap: {
          title: { label: t(`${fieldPrefix}.personalDocument.title`) },
          author: { label: t(`${fieldPrefix}.personalDocument.author`) },
          creation_date: {
            label: t(`${fieldPrefix}.personalDocument.creationDate`),
          },
          last_modified_date: {
            label: t(`${fieldPrefix}.personalDocument.lastModifiedDate`),
          },
          document_type: {
            label: t(`${fieldPrefix}.personalDocument.documentType`),
            inputType: "select",
          },
          "tags/category": {
            label: t(`${fieldPrefix}.personalDocument.tagsCategory`),
          },
        },
      },
      business_document: {
        text: t("datasetDocuments.metadata.type.businessDocument"),
        iconName: "briefcase",
        subFieldsMap: {
          title: { label: t(`${fieldPrefix}.businessDocument.title`) },
          author: { label: t(`${fieldPrefix}.businessDocument.author`) },
          creation_date: {
            label: t(`${fieldPrefix}.businessDocument.creationDate`),
          },
          last_modified_date: {
            label: t(`${fieldPrefix}.businessDocument.lastModifiedDate`),
          },
          document_type: {
            label: t(`${fieldPrefix}.businessDocument.documentType`),
            inputType: "select",
          },
          "department/team": {
            label: t(`${fieldPrefix}.businessDocument.departmentTeam`),
          },
        },
      },
      im_chat_log: {
        text: t("datasetDocuments.metadata.type.IMChat"),
        iconName: "messageTextCircle",
        subFieldsMap: {
          chat_platform: { label: t(`${fieldPrefix}.IMChat.chatPlatform`) },
          "chat_participants/group_name": {
            label: t(`${fieldPrefix}.IMChat.chatPartiesGroupName`),
          },
          start_date: { label: t(`${fieldPrefix}.IMChat.startDate`) },
          end_date: { label: t(`${fieldPrefix}.IMChat.endDate`) },
          participants: { label: t(`${fieldPrefix}.IMChat.participants`) },
          topicsKeywords: {
            label: t(`${fieldPrefix}.IMChat.topicsKeywords`),
            inputType: "textarea",
          },
          fileType: { label: t(`${fieldPrefix}.IMChat.fileType`) },
        },
      },
      wikipedia_entry: {
        text: t("datasetDocuments.metadata.type.wikipediaEntry"),
        allowEdit: false,
        subFieldsMap: {
          title: { label: t(`${fieldPrefix}.wikipediaEntry.title`) },
          language: {
            label: t(`${fieldPrefix}.wikipediaEntry.language`),
            inputType: "select",
          },
          web_page_url: { label: t(`${fieldPrefix}.wikipediaEntry.webpageURL`) },
          "editor/contributor": {
            label: t(`${fieldPrefix}.wikipediaEntry.editorContributor`),
          },
          last_edit_date: {
            label: t(`${fieldPrefix}.wikipediaEntry.lastEditDate`),
          },
          "summary/introduction": {
            label: t(`${fieldPrefix}.wikipediaEntry.summaryIntroduction`),
            inputType: "textarea",
          },
        },
      },
      synced_from_notion: {
        text: t("datasetDocuments.metadata.type.notion"),
        allowEdit: false,
        subFieldsMap: {
          title: { label: t(`${fieldPrefix}.notion.title`) },
          language: { label: t(`${fieldPrefix}.notion.lang`), inputType: "select" },
          "author/creator": { label: t(`${fieldPrefix}.notion.author`) },
          creation_date: { label: t(`${fieldPrefix}.notion.createdTime`) },
          last_modified_date: {
            label: t(`${fieldPrefix}.notion.lastModifiedTime`),
          },
          notion_page_link: { label: t(`${fieldPrefix}.notion.url`) },
          "category/tags": { label: t(`${fieldPrefix}.notion.tag`) },
          description: { label: t(`${fieldPrefix}.notion.desc`) },
        },
      },
      synced_from_github: {
        text: t("datasetDocuments.metadata.type.github"),
        allowEdit: false,
        subFieldsMap: {
          repository_name: { label: t(`${fieldPrefix}.github.repoName`) },
          repository_description: { label: t(`${fieldPrefix}.github.repoDesc`) },
          "repository_owner/organization": { label: t(`${fieldPrefix}.github.repoOwner`) },
          code_filename: { label: t(`${fieldPrefix}.github.fileName`) },
          code_file_path: { label: t(`${fieldPrefix}.github.filePath`) },
          programming_language: { label: t(`${fieldPrefix}.github.programmingLang`) },
          github_link: { label: t(`${fieldPrefix}.github.url`) },
          open_source_license: { label: t(`${fieldPrefix}.github.license`) },
          commit_date: { label: t(`${fieldPrefix}.github.lastCommitTime`) },
          commit_author: {
            label: t(`${fieldPrefix}.github.lastCommitAuthor`),
          },
        },
      },
      originInfo: {
        // This section has no title of its own.
        text: "",
        allowEdit: false,
        subFieldsMap: {
          name: { label: t(`${fieldPrefix}.originInfo.originalFilename`) },
          "data_source_info.upload_file.size": {
            label: t(`${fieldPrefix}.originInfo.originalFileSize`),
            render: (value) => formatFileSize(value),
          },
          created_at: {
            label: t(`${fieldPrefix}.originInfo.uploadDate`),
            render: renderUnixTime,
          },
          completed_at: {
            label: t(`${fieldPrefix}.originInfo.lastUpdateDate`),
            render: renderUnixTime,
          },
          data_source_type: {
            label: t(`${fieldPrefix}.originInfo.source`),
            render: (value) => t(`datasetDocuments.metadata.source.${value}`),
          },
        },
      },
      technicalParameters: {
        text: t("datasetDocuments.metadata.type.technicalParameters"),
        allowEdit: false,
        subFieldsMap: {
          "dataset_process_rule.mode": {
            label: t(`${fieldPrefix}.technicalParameters.segmentSpecification`),
            // Any mode other than 'automatic' is shown as "custom".
            render: (value) =>
              value === 'automatic'
                ? (t('datasetDocuments.embedding.automatic') as string)
                : (t('datasetDocuments.embedding.custom') as string),
          },
          "dataset_process_rule.rules.segmentation.max_tokens": {
            label: t(`${fieldPrefix}.technicalParameters.segmentLength`),
            render: (value) => formatNumber(value),
          },
          // NOTE(review): the unit suffixes "characters", "paragraphs" and
          // "tokens" below are hard-coded English, unlike the labels.
          average_segment_length: {
            label: t(`${fieldPrefix}.technicalParameters.avgParagraphLength`),
            render: (value) => `${formatNumber(value)} characters`,
          },
          segment_count: {
            label: t(`${fieldPrefix}.technicalParameters.paragraphs`),
            render: (value) => `${formatNumber(value)} paragraphs`,
          },
          hit_count: {
            label: t(`${fieldPrefix}.technicalParameters.hitCount`),
            render: (value, total) => {
              const hits = value || 0;
              // `total` is optional: normalise a missing total to 0 so the
              // text never reads "(n/undefined)".
              const all = total ?? 0;
              // A zero total would divide by zero; show a plain 0 instead.
              const percent = all ? ((hits / all) * 100).toFixed(2) : 0;
              return `${percent}% (${hits}/${all})`;
            },
          },
          indexing_latency: {
            label: t(`${fieldPrefix}.technicalParameters.embeddingTime`),
            render: (value) => formatTime(value),
          },
          tokens: {
            label: t(`${fieldPrefix}.technicalParameters.embeddedSpend`),
            render: (value) => `${formatNumber(value)} tokens`,
          },
        },
      },
    };
  };
  const langPrefix = 'datasetDocuments.metadata.languageMap.'

  /**
   * Hook returning an object keyed by language code whose values are the
   * translations looked up under `langPrefix` + code.
   */
  export const useLanguages = () => {
    const { t } = useTranslation()
    // Resolve one language code against the shared key prefix.
    const nameOf = (code: string) => t(`${langPrefix}${code}`)

    return {
      zh: nameOf('zh'),
      en: nameOf('en'),
      es: nameOf('es'),
      fr: nameOf('fr'),
      de: nameOf('de'),
      ja: nameOf('ja'),
      ko: nameOf('ko'),
      ru: nameOf('ru'),
      ar: nameOf('ar'),
      pt: nameOf('pt'),
      it: nameOf('it'),
      nl: nameOf('nl'),
      pl: nameOf('pl'),
      sv: nameOf('sv'),
      tr: nameOf('tr'),
      he: nameOf('he'),
      hi: nameOf('hi'),
      da: nameOf('da'),
      fi: nameOf('fi'),
      no: nameOf('no'),
      hu: nameOf('hu'),
      el: nameOf('el'),
      cs: nameOf('cs'),
      th: nameOf('th'),
      id: nameOf('id'),
    }
  }
  const bookCategoryPrefix = 'datasetDocuments.metadata.categoryMap.book.'

  /**
   * Hook returning an object keyed by book category whose values are the
   * translations looked up under `bookCategoryPrefix` + category.
   */
  export const useBookCategories = () => {
    const { t } = useTranslation()
    // Resolve one category key against the shared key prefix.
    const category = (key: string) => t(`${bookCategoryPrefix}${key}`)

    return {
      fiction: category('fiction'),
      biography: category('biography'),
      history: category('history'),
      science: category('science'),
      technology: category('technology'),
      education: category('education'),
      philosophy: category('philosophy'),
      religion: category('religion'),
      socialSciences: category('socialSciences'),
      art: category('art'),
      travel: category('travel'),
      health: category('health'),
      selfHelp: category('selfHelp'),
      businessEconomics: category('businessEconomics'),
      cooking: category('cooking'),
      childrenYoungAdults: category('childrenYoungAdults'),
      comicsGraphicNovels: category('comicsGraphicNovels'),
      poetry: category('poetry'),
      drama: category('drama'),
      other: category('other'),
    }
  }
  const personalDocCategoryPrefix = 'datasetDocuments.metadata.categoryMap.personalDoc.'

  /**
   * Hook returning an object keyed by personal-document category whose values
   * are the translations looked up under `personalDocCategoryPrefix` + category.
   */
  export const usePersonalDocCategories = () => {
    const { t } = useTranslation()
    // Resolve one category key against the shared key prefix.
    const category = (key: string) => t(`${personalDocCategoryPrefix}${key}`)

    return {
      notes: category('notes'),
      blogDraft: category('blogDraft'),
      diary: category('diary'),
      researchReport: category('researchReport'),
      bookExcerpt: category('bookExcerpt'),
      schedule: category('schedule'),
      list: category('list'),
      projectOverview: category('projectOverview'),
      photoCollection: category('photoCollection'),
      creativeWriting: category('creativeWriting'),
      codeSnippet: category('codeSnippet'),
      designDraft: category('designDraft'),
      personalResume: category('personalResume'),
      other: category('other'),
    }
  }
  const businessDocCategoryPrefix = 'datasetDocuments.metadata.categoryMap.businessDoc.'

  /**
   * Hook returning an object keyed by business-document category whose values
   * are the translations looked up under `businessDocCategoryPrefix` + category.
   */
  export const useBusinessDocCategories = () => {
    const { t } = useTranslation()
    // Resolve one category key against the shared key prefix.
    const category = (key: string) => t(`${businessDocCategoryPrefix}${key}`)

    return {
      meetingMinutes: category('meetingMinutes'),
      researchReport: category('researchReport'),
      proposal: category('proposal'),
      employeeHandbook: category('employeeHandbook'),
      trainingMaterials: category('trainingMaterials'),
      requirementsDocument: category('requirementsDocument'),
      designDocument: category('designDocument'),
      productSpecification: category('productSpecification'),
      financialReport: category('financialReport'),
      marketAnalysis: category('marketAnalysis'),
      projectPlan: category('projectPlan'),
      teamStructure: category('teamStructure'),
      policiesProcedures: category('policiesProcedures'),
      contractsAgreements: category('contractsAgreements'),
      emailCorrespondence: category('emailCorrespondence'),
      other: category('other'),
    }
  }