
    @'h
"                         d Z ddlZddlmZ ddlmZmZmZmZm	Z	m
Z
 ddlmZ ddlmZmZmZ 	 ddlmZ  G d d	e      Zy# e$ r dZY w xY w)
a  
Unstructured file reader.

A parser for unstructured text files using Unstructured.io.
Supports .csv, .tsv, .doc, .docx, .odt, .epub, .org, .rst, .rtf,
.md, .msg, .pdf, .heic, .png, .jpg, .jpeg, .tiff, .bmp, .ppt, .pptx,
.xlsx, .eml, .html, .xml, .txt, .json documents.

    N)Path)AnyDictListOptionalSetTuple)
BaseReader)DocumentNodeRelationshipTextNode)Elementc                   @    e Zd ZdZddddddedededee   dee   d	df fd
Z	e
ddedefd       Z	 	 	 	 	 	 ddee   dee   dee   dee   dee   deee      d	ee   fdZ	 ddedee   d	ee   fdZdee   dee   dee   dee   deee      d	ee   fdZ xZS )UnstructuredReaderz8General unstructured text reader for a variety of files.N)api_keyurlallowed_metadata_typesexcluded_metadata_keysargsr   r   r   r   returnc                    t        |   |  t        t        d      || _        t        |      | _        | j                  r|xs dnd| _        |xs t        t        t        t        d      f| _        |xs dh| _        y)a  
        Initialize UnstructuredReader.

        Args:
            *args (Any): Additional arguments passed to the BaseReader.
            api_key (str, optional): API key for accessing the Unstructured.io API. If provided, the reader will use the API for parsing files. Defaults to None.
            url (str, optional): URL for the Unstructured.io API. If not provided and an api_key is given, defaults to "http://localhost:8000". Ignored if api_key is not provided. Defaults to None.
            allowed_metadata_types (Optional[Tuple], optional): Tuple of types that are allowed in the metadata. Defaults to (str, int, float, type(None)).
            excluded_metadata_keys (Optional[Set], optional): Set of metadata keys to exclude from the final document. Defaults to {"orig_elements"}.

        Attributes:
            api_key (str or None): Stores the API key.
            use_api (bool): Indicates whether to use the API for parsing files, based on the presence of the api_key.
            url (str or None): URL for the Unstructured.io API if using the API.
            allowed_metadata_types (Tuple): Tuple of types that are allowed in the metadata.
            excluded_metadata_keys (Set): Set of metadata keys to exclude from the final document.
        NzUUnstructured is not installed. Please install it using 'pip install -U unstructured'.zhttp://localhost:8000orig_elements)super__init__r   ImportErrorr   booluse_apir   strintfloattyper   r   )selfr   r   r   r   r   	__class__s         q/home/kushmeetdev/Regenta/Chatbot/venv/lib/python3.12/site-packages/llama_index/readers/file/unstructured/base.pyr   zUnstructuredReader.__init__   s    2 	$?g  G}59\\311t&< '
J	A
# '=&Q@Q#    c                      | ||      S )zSet the server url and api key. )clsr   r   s      r$   from_apizUnstructuredReader.from_apiF   s     7C  r%   fileunstructured_kwargsdocument_kwargs
extra_infosplit_documentsc                     |r|j                         ni }|j                  d      |j                  d      t        d      | j                  ||      }| j	                  |||||      S )a  
        Load data using Unstructured.io.

        Depending on the configuration, if url is set or use_api is True,
        it'll parse the file using an API call, otherwise it parses it locally.
        extra_info is extended by the returned metadata if split_documents is True.

        Args:
            file (Optional[Path]): Path to the file to be loaded.
            unstructured_kwargs (Optional[Dict]): Additional arguments for unstructured partitioning.
            document_kwargs (Optional[Dict]): Additional arguments for document creation.
            extra_info (Optional[Dict]): Extra information to add to the document metadata.
            split_documents (Optional[bool]): Whether to split the documents.
            excluded_metadata_keys (Optional[List[str]]): Keys to exclude from the metadata.

        Returns:
            List[Document]: List of parsed documents.
        r*   metadata_filenamezePlease provide a 'metadata_filename' as part of the 'unstructured_kwargs' when loading a file stream.)copyget
ValueError_partition_elements_create_documents)r"   r*   r+   r,   r-   r.   r   elementss           r$   	load_datazUnstructuredReader.load_dataK   s    6 =P1668UW  ##F+7#''(;<Dw  #'":":;NPT"U%%"
 	
r%   c                     |rt        |      |d<   | j                  r(ddlm}  |d| j                  | j
                  dz   d|S ddlm}  |di |S )a:  
        Partition the elements from the file or via API.

        Args:
            file (Optional[Path]): Path to the file to be loaded.
            unstructured_kwargs (Dict): Additional arguments for unstructured partitioning.

        Returns:
            List[Element]: List of partitioned elements.
        filenamer   )partition_via_apiz/general/v0/general)r   api_url)	partitionr'   )r   r   unstructured.partition.apir:   r   r   unstructured.partition.autor<   )r"   r+   r*   r:   r<   s        r$   r4   z&UnstructuredReader._partition_elementsz   sd     .1$i
+<<D$ #88 &  >3233r%   r6   c           	          |xs i }|xs i t        |xs  j                        g }	 ddt        dt        t           dt
        t        t        f   f fd}t        |      dk(  rg S |D 	cg c]*  }	dj                  t        |	      j                               , }
}	 ||d         }|j                  dd      xs |d	   }t        dd
j                  |
      |||d|}|rg }t        |      D ]s  \  }}|j                  |      }t        d|j                    |||      ||d|}|j#                         |j$                  t&        j(                  <   |j+                  |       u |S |g}|S c c}	w )a0  
        Create documents from partitioned elements.

        Args:
            elements (List): List of partitioned elements.
            document_kwargs (Optional[Dict]): Additional arguments for document creation.
            extra_info (Optional[Dict]): Extra information to add to the document metadata.
            split_documents (Optional[bool]): Whether to split the documents.
            excluded_metadata_keys (Optional[List[str]]): Keys to exclude from the metadata.

        Returns:
            List[Document]: List of parsed documents.
        Nelementsequence_numberr   c           	         i | j                   j                         }|j                         D ci c]7  \  }}|vr.|t        |j                        r|nt        j                  |      9 }}}|||d<   |S c c}}w )NrA   )metadatato_dictitems
isinstancer   jsondumps)	r@   rA   candidate_metadatakeyvaluerC   
doc_extrasexcluded_keysr"   s	         r$   _merge_metadataz=UnstructuredReader._create_documents.<locals>._merge_metadata   s     "NG$4$4$<$<$>!M*!M #5":":"< Cm+ !%)D)DE E*+H  *.=*+Os   <A;r    	file_pathr9   z

)textr-   doc_idid_)rQ   rC   rR   rS   Nr'   )setr   r   r   r   r   r   r   lenjoinsplitr2   r   	enumerate
id_to_hashr   rQ   as_related_node_inforelationshipsr   SOURCEappend)r"   r6   r,   r-   r.   r   
doc_kwargsdocsrN   eltext_chunksrC   r9   sourcerA   r@   hash_idnoderL   rM   s   `                 @@r$   r5   z$UnstructuredReader._create_documents   s   * %*
%2
2Qd6Q6QR! @D		/7}	#s(^	" x=AI;CDRsxxB0DD"8A;/<<T2Jhz6J 
[)	

 
 D,5h,? "(!,,_=  ,WoF"	
 ! //1 ""$++ D!"   8D; Es   //E/rT   )NNNNFN)__name__
__module____qualname____doc__r   r   r   r	   r   r   classmethodr)   r   r   r   r   r   r7   r   r4   r5   __classcell__)r#   s   @r$   r   r      s   B
 2604)R)R )R 	)R
 !))R !))R 
)RV !s ! ! !  $.2*.%)*/6:-
tn-
 &d^-
 "$	-

 TN-
 "$-
 !)c 3-
 
h-
` AE4#'4/7~4	g4:Kw-K "$K TN	K
 "$K !)c 3K 
hKr%   r   )ri   rG   pathlibr   typingr   r   r   r   r   r	   llama_index.core.readers.baser
   llama_index.core.schemar   r   r   unstructured.documents.elementsr   r   r   r'   r%   r$   <module>rq      sM      8 8 4 H H7
J J	  Gs   A   A
	A
