|  | @@ -5,6 +5,7 @@ import json
 | 
											
												
													
														|  |  import docx
 |  |  import docx
 | 
											
												
													
														|  |  import pandas as pd
 |  |  import pandas as pd
 | 
											
												
													
														|  |  import pypdfium2
 |  |  import pypdfium2
 | 
											
												
													
														|  | 
 |  | +import yaml
 | 
											
												
													
														|  |  from unstructured.partition.email import partition_email
 |  |  from unstructured.partition.email import partition_email
 | 
											
												
													
														|  |  from unstructured.partition.epub import partition_epub
 |  |  from unstructured.partition.epub import partition_epub
 | 
											
												
													
														|  |  from unstructured.partition.msg import partition_msg
 |  |  from unstructured.partition.msg import partition_msg
 | 
											
										
											
												
													
														|  | @@ -101,6 +102,8 @@ def _extract_text_by_mime_type(*, file_content: bytes, mime_type: str) -> str:
 | 
											
												
													
														|  |              return _extract_text_from_msg(file_content)
 |  |              return _extract_text_from_msg(file_content)
 | 
											
												
													
														|  |          case "application/json":
 |  |          case "application/json":
 | 
											
												
													
														|  |              return _extract_text_from_json(file_content)
 |  |              return _extract_text_from_json(file_content)
 | 
											
												
													
														|  | 
 |  | +        case "application/x-yaml" | "text/yaml":
 | 
											
												
													
														|  | 
 |  | +            return _extract_text_from_yaml(file_content)
 | 
											
												
													
														|  |          case _:
 |  |          case _:
 | 
											
												
													
														|  |              raise UnsupportedFileTypeError(f"Unsupported MIME type: {mime_type}")
 |  |              raise UnsupportedFileTypeError(f"Unsupported MIME type: {mime_type}")
 | 
											
												
													
														|  |  
 |  |  
 | 
											
										
											
												
													
														|  | @@ -112,6 +115,8 @@ def _extract_text_by_file_extension(*, file_content: bytes, file_extension: str)
 | 
											
												
													
														|  |              return _extract_text_from_plain_text(file_content)
 |  |              return _extract_text_from_plain_text(file_content)
 | 
											
												
													
														|  |          case ".json":
 |  |          case ".json":
 | 
											
												
													
														|  |              return _extract_text_from_json(file_content)
 |  |              return _extract_text_from_json(file_content)
 | 
											
												
													
														|  | 
 |  | +        case ".yaml" | ".yml":
 | 
											
												
													
														|  | 
 |  | +            return _extract_text_from_yaml(file_content)
 | 
											
												
													
														|  |          case ".pdf":
 |  |          case ".pdf":
 | 
											
												
													
														|  |              return _extract_text_from_pdf(file_content)
 |  |              return _extract_text_from_pdf(file_content)
 | 
											
												
													
														|  |          case ".doc" | ".docx":
 |  |          case ".doc" | ".docx":
 | 
											
										
											
												
													
														|  | @@ -149,6 +154,15 @@ def _extract_text_from_json(file_content: bytes) -> str:
 | 
											
												
													
														|  |          raise TextExtractionError(f"Failed to decode or parse JSON file: {e}") from e
 |  |          raise TextExtractionError(f"Failed to decode or parse JSON file: {e}") from e
 | 
											
												
													
														|  |  
 |  |  
 | 
											
												
													
														|  |  
 |  |  
 | 
											
												
													
														|  | 
 |  | +def _extract_text_from_yaml(file_content: bytes) -> str:
 | 
											
												
													
														|  | 
 |  | +    """Extract the content from yaml file"""
 | 
											
												
													
														|  | 
 |  | +    try:
 | 
											
												
													
														|  | 
 |  | +        yaml_data = yaml.safe_load_all(file_content.decode("utf-8"))
 | 
											
												
													
														|  | 
 |  | +        return yaml.dump_all(yaml_data, allow_unicode=True, sort_keys=False)
 | 
											
												
													
														|  | 
 |  | +    except (UnicodeDecodeError, yaml.YAMLError) as e:
 | 
											
												
													
														|  | 
 |  | +        raise TextExtractionError(f"Failed to decode or parse YAML file: {e}") from e
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  |  def _extract_text_from_pdf(file_content: bytes) -> str:
 |  |  def _extract_text_from_pdf(file_content: bytes) -> str:
 | 
											
												
													
														|  |      try:
 |  |      try:
 | 
											
												
													
														|  |          pdf_file = io.BytesIO(file_content)
 |  |          pdf_file = io.BytesIO(file_content)
 |