document.py 2.5 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879
  1. from abc import ABC, abstractmethod
  2. from collections.abc import Sequence
  3. from typing import Any, Optional
  4. from pydantic import BaseModel, Field
  class Document(BaseModel):
      """Store a piece of text together with its associated metadata."""

      # The text of the document.
      page_content: str

      # Arbitrary metadata about the page content (e.g., source, relationships
      # to other documents, etc.). When omitted, each instance gets its own new
      # empty dict via ``default_factory``; the annotation also permits ``None``.
      metadata: Optional[dict] = Field(default_factory=dict)
  class BaseDocumentTransformer(ABC):
      """Interface for components that turn Documents into other Documents.

      An implementation receives a sequence of Documents and hands back a
      sequence of transformed Documents. Both the synchronous and the
      asynchronous entry points are abstract, so a concrete subclass has to
      define each of them before it can be instantiated.

      Example:
          .. code-block:: python

              class EmbeddingsRedundantFilter(BaseDocumentTransformer, BaseModel):
                  embeddings: Embeddings
                  similarity_fn: Callable = cosine_similarity
                  similarity_threshold: float = 0.95

                  class Config:
                      arbitrary_types_allowed = True

                  def transform_documents(
                      self, documents: Sequence[Document], **kwargs: Any
                  ) -> Sequence[Document]:
                      stateful_documents = get_stateful_documents(documents)
                      embedded_documents = _get_embeddings_from_stateful_docs(
                          self.embeddings, stateful_documents
                      )
                      included_idxs = _filter_similar_embeddings(
                          embedded_documents,
                          self.similarity_fn,
                          self.similarity_threshold,
                      )
                      return [stateful_documents[i] for i in sorted(included_idxs)]

                  async def atransform_documents(
                      self, documents: Sequence[Document], **kwargs: Any
                  ) -> Sequence[Document]:
                      raise NotImplementedError

      """

      @abstractmethod
      def transform_documents(
          self,
          documents: Sequence[Document],
          **kwargs: Any,
      ) -> Sequence[Document]:
          """Transform a sequence of documents.

          Args:
              documents: The Documents to transform.
              **kwargs: Extra keyword arguments; their meaning is left to the
                  implementing subclass.

          Returns:
              A sequence of transformed Documents.
          """

      @abstractmethod
      async def atransform_documents(
          self,
          documents: Sequence[Document],
          **kwargs: Any,
      ) -> Sequence[Document]:
          """Asynchronously transform a sequence of documents.

          Args:
              documents: The Documents to transform.
              **kwargs: Extra keyword arguments; their meaning is left to the
                  implementing subclass.

          Returns:
              A sequence of transformed Documents.
          """