
    @'hp                     f    d dl mZ d dlmZmZmZmZ d dlmZ d dl	m
Z
 erd dlmZ  G d de      Zy)	    )Path)TYPE_CHECKINGDictListOptional)
BaseReader)Document)Tagc                   j     e Zd ZdZ	 	 ddededdf fdZ	 ddedee	   de
e   fd	Zdd
defdZ xZS )HTMLTagReaderz
    Read HTML files and extract text from a specific tag with BeautifulSoup.

    By default, reads the text from the ``<section>`` tag.
    tagignore_no_idreturnNc                 >    || _         || _        t        |           y N)_tag_ignore_no_idsuper__init__)selfr   r   	__class__s      i/home/kushmeetdev/Regenta/Chatbot/venv/lib/python3.12/site-packages/llama_index/readers/file/html/base.pyr   zHTMLTagReader.__init__   s    
 	)    file
extra_infoc                    	 ddl m} t        |d      5 } ||d      }d d d        j	                  | j
                        }g }|D ]  }|j                  d      }	| j                  |      }
| j                  r|	s4| j
                  |	t        |      d}|j                  |xs i        t        |
|	      }|j                  |        |S # t        $ r t        d      w xY w# 1 sw Y   xY w)
Nr   )BeautifulSoup#bs4 is required to read HTML files.zutf-8)encodingzhtml.parserid)r   tag_id	file_path)textmetadata)bs4r   ImportErroropenfind_allr   get_extract_text_from_tagr   strupdater	   append)r   r   r   r   	html_filesouptagsdocsr   r!   tag_textr$   docs                r   	load_datazHTMLTagReader.load_data   s    	E) $) 	;Y M:D	; }}TYY' 	CWWT]F2237H!!& yy  YH
 OOJ,"-!C KK%	& 7  	ECDD	E	; 	;s   C
 
C"
C"C+r
   c                    	 ddl m} g }|j                  D ]  }t	        ||      r1|j                         s |j                  |j                                @|j                  | j                  k(  rZ|j                  |j                         j                                 dj                  |      S # t        $ r t        d      w xY w)Nr   )NavigableStringr   
)r%   r6   r&   children
isinstancestripr-   namer   get_textjoin)r   r   r6   textselems        r   r*   z$HTMLTagReader._extract_text_from_tag>   s    	E+ LL 	6D$0::<LL.dii'T]]_2245	6 yy  	ECDD	Es   B0 0C)sectionFr   )__name__
__module____qualname____doc__r+   boolr   r   r   r   r   r	   r4   r*   __classcell__)r   s   @r   r   r      so     "  
	 8<  &.tn 	h D %  C  r   r   N)pathlibr   typingr   r   r   r   llama_index.core.readers.baser   llama_index.core.schemar	   r%   r
   r    r   r   <module>rL      s(     6 6 4 ,B J B r   