Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
90 changes: 46 additions & 44 deletions unstructured/staging/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -381,50 +381,52 @@ def convert_to_csv(elements: Iterable[Element]) -> str:

@requires_dependencies(["pandas"])
def get_default_pandas_dtypes() -> dict[str, Any]:
    """Return the default pandas dtype for each known element/metadata column.

    The mapping is keyed by column name; values are whatever pandas accepts as
    a dtype specifier (a ``pd.StringDtype()`` instance, a dtype alias such as
    ``"Int64"`` or ``"boolean"``, or a Python type such as ``float`` or
    ``object``).

    The table is built on the first call and stored on the function object, so
    the dtype instances are only constructed once. Every call returns a shallow
    copy: callers may add, drop or replace entries in the returned dict without
    affecting later calls, but the dtype objects themselves are shared between
    the returned dicts.

    Returns:
        A new ``dict`` mapping column name to its default dtype specifier.
    """
    cached = getattr(get_default_pandas_dtypes, "_cache", None)
    if cached is None:
        # Built lazily inside the call (not at import time) because the table
        # needs `pd`, and this function is gated on pandas by the decorator.
        cached = {
            "text": pd.StringDtype(),  # type: ignore
            "type": pd.StringDtype(),  # type: ignore
            "element_id": pd.StringDtype(),  # type: ignore
            "filename": pd.StringDtype(),  # Optional[str]  # type: ignore
            "filetype": pd.StringDtype(),  # Optional[str]  # type: ignore
            "file_directory": pd.StringDtype(),  # Optional[str]  # type: ignore
            "last_modified": pd.StringDtype(),  # Optional[str]  # type: ignore
            "attached_to_filename": pd.StringDtype(),  # Optional[str]  # type: ignore
            "parent_id": pd.StringDtype(),  # Optional[str]  # type: ignore
            "category_depth": "Int64",  # Optional[int]
            "image_path": pd.StringDtype(),  # Optional[str]  # type: ignore
            "languages": object,  # Optional[list[str]]
            "page_number": "Int64",  # Optional[int]
            "page_name": pd.StringDtype(),  # Optional[str]  # type: ignore
            "url": pd.StringDtype(),  # Optional[str]  # type: ignore
            "link_urls": pd.StringDtype(),  # Optional[str]  # type: ignore
            "link_texts": object,  # Optional[list[str]]
            "links": object,
            "sent_from": object,  # Optional[list[str]]
            "sent_to": object,  # Optional[list[str]]
            "subject": pd.StringDtype(),  # Optional[str]  # type: ignore
            "section": pd.StringDtype(),  # Optional[str]  # type: ignore
            "header_footer_type": pd.StringDtype(),  # Optional[str]  # type: ignore
            "emphasized_text_contents": object,  # Optional[list[str]]
            "emphasized_text_tags": object,  # Optional[list[str]]
            "text_as_html": pd.StringDtype(),  # Optional[str]  # type: ignore
            "max_characters": "Int64",  # Optional[int]
            "is_continuation": "boolean",  # Optional[bool]
            "detection_class_prob": float,  # Optional[float]
            "sender": pd.StringDtype(),  # type: ignore
            "coordinates_points": object,
            "coordinates_system": pd.StringDtype(),  # type: ignore
            "coordinates_layout_width": float,
            "coordinates_layout_height": float,
            "data_source_url": pd.StringDtype(),  # Optional[str]  # type: ignore
            "data_source_version": pd.StringDtype(),  # Optional[str]  # type: ignore
            "data_source_record_locator": object,
            "data_source_date_created": pd.StringDtype(),  # Optional[str]  # type: ignore
            "data_source_date_modified": pd.StringDtype(),  # Optional[str]  # type: ignore
            "data_source_date_processed": pd.StringDtype(),  # Optional[str]  # type: ignore
            "data_source_permissions_data": object,
            "embeddings": object,
        }
        # Publish only the fully built table, in a single attribute assignment.
        get_default_pandas_dtypes._cache = cached  # type: ignore[attr-defined]
    # Hand out a copy so callers cannot mutate the cached table.
    return cached.copy()


@requires_dependencies(["pandas"])
Expand Down