o
    MK&hE                     @  s"  d Z ddlmZ dZdgZddlmZ ddlmZm	Z	m
Z
mZmZmZmZmZmZmZmZ ddlmZmZmZmZmZmZ ddlmZmZ dd	lmZmZm Z m!Z! dd
l"m#Z# eroddl$m%Z% ddlm&Z& ddl'm(Z(m)Z)m*Z* dZ+e	ee,e,f e,e,gdf Z-G dd deeZ.G dd de Z/dS )zCUse the HTMLParser library to parse HTML files that aren't too bad.    )annotationsMITHTMLParserTreeBuilder)
HTMLParser)AnyCallablecastDictIterableListOptionalTYPE_CHECKINGTupleTypeUnion)AttributeDictCDataCommentDeclarationDoctypeProcessingInstruction)EntitySubstitutionUnicodeDammit)DetectsXMLParsedAsHTMLHTMLHTMLTreeBuilderSTRICTParserRejectedMarkup)BeautifulSoup)NavigableString)	_Encoding
_Encodings
_RawMarkupzhtml.parserNc                   @  s   e Zd ZU dZded< dZded< 	 edd4ddZded< ded< ded< d5ddZd6ddZ	d7d8dd Z	d7d9d"d#Z
d:d%d&Zd;d'd(Zd;d)d*Zd:d+d,Zd:d-d.Zd:d/d0Zd:d1d2Zd3S )<BeautifulSoupHTMLParserreplacestrREPLACEignoreIGNOREon_duplicate_attributesoupr   argsr   r+   &Union[str, _DuplicateAttributeHandler]kwargsc                O  s@   || _ || _|jj| _tj| g|R i | g | _|   d S N)r,   r+   builderattribute_dict_classr   __init__already_closed_empty_element_initialize_xml_detector)selfr,   r+   r-   r/    r7   J/var/www/html/venv/lib/python3.10/site-packages/bs4/builder/_htmlparser.pyr3   T   s   
	z BeautifulSoupHTMLParser.__init__z	List[str]r4   messagereturnNonec                 C  s   t |r0   r   )r6   r9   r7   r7   r8   erroro   s   zBeautifulSoupHTMLParser.errornameattrsList[Tuple[str, Optional[str]]]c                 C  s   | j ||dd | | dS )zHandle an incoming empty-element tag.

        html.parser only calls this method when the markup looks like
        <tag/>.
        F)handle_empty_elementN)handle_starttaghandle_endtag)r6   r=   r>   r7   r7   r8   handle_startendtag   s   z*BeautifulSoupHTMLParser.handle_startendtagTr@   boolc                 C  s   |   }|D ]3\}}|du rd}||v r5| j}|| jkrq|d| jfv r)|||< qtt|}|||| q|||< q| jjjrF| 	 \}}	nd }}	| jj
|dd|||	d}
|
rj|
jrj|rj| j|dd | j| | jdu rv| | dS dS )zHandle an opening tag, e.g. '<tag>'

        :param handle_empty_element: True if this tag is known to be
            an empty-element tag (i.e. there is not expected to be any
            closing tag).
        N )
sourceline	sourceposF)check_already_closed)r2   r+   r)   r'   r   _DuplicateAttributeHandlerr,   r1   store_line_numbersgetposrA   is_empty_elementrB   r4   append_root_tag_name_root_tag_encountered)r6   r=   r>   r@   	attr_dictkeyvalueon_duperF   rG   tagr7   r7   r8   rA      s2   






z'BeautifulSoupHTMLParser.handle_starttagrH   c                 C  s.   |r|| j v r| j | dS | j| dS )zHandle a closing tag, e.g. '</tag>'

        :param name: A tag name.
        :param check_already_closed: True if this tag is expected to
           be the closing portion of an empty-element tag,
           e.g. '<tag></tag>'.
        N)r4   remover,   rB   )r6   r=   rH   r7   r7   r8   rB      s   	z%BeautifulSoupHTMLParser.handle_endtagdatac                 C  s   | j | dS )z4Handle some textual data that shows up between tags.N)r,   handle_datar6   rV   r7   r7   r8   rW      s   z#BeautifulSoupHTMLParser.handle_datac              	   C  s   | drt|dd}n| drt|dd}nt|}d}|dk rE| jjdfD ]}|s1q,z
t|g|}W q, tyD   Y q,w |sYzt|}W n t	t
fyX   Y nw |p\d}| | dS )zHandle a numeric character reference by converting it to the
        corresponding Unicode character and treating it as textual
        data.

        :param name: Character number, possibly in hexadecimal.
        x   XN   zwindows-1252u   �)
startswithintlstripr,   original_encoding	bytearraydecodeUnicodeDecodeErrorchr
ValueErrorOverflowErrorrW   )r6   r=   	real_namerV   encodingr7   r7   r8   handle_charref   s.   

z&BeautifulSoupHTMLParser.handle_charrefc                 C  s0   t j|}|dur|}nd| }| | dS )zHandle a named entity reference by converting it to the
        corresponding Unicode character(s) and treating it as textual
        data.

        :param name: Name of the entity reference.
        Nz&%s)r   HTML_ENTITY_TO_CHARACTERgetrW   )r6   r=   	characterrV   r7   r7   r8   handle_entityref
  s
   z(BeautifulSoupHTMLParser.handle_entityrefc                 C  s&   | j   | j | | j t dS )zOHandle an HTML comment.

        :param data: The text of the comment.
        N)r,   endDatarW   r   rX   r7   r7   r8   handle_comment  s   
z&BeautifulSoupHTMLParser.handle_commentc                 C  s6   | j   |tdd }| j | | j t dS )zYHandle a DOCTYPE declaration.

        :param data: The text of the declaration.
        zDOCTYPE N)r,   rn   lenrW   r   rX   r7   r7   r8   handle_decl&  s   
z#BeautifulSoupHTMLParser.handle_declc                 C  sN   |  drt}|tdd }nt}| j  | j| | j| dS )z{Handle a declaration of unknown type -- probably a CDATA block.

        :param data: The text of the declaration.
        zCDATA[N)upperr]   r   rp   r   r,   rn   rW   )r6   rV   clsr7   r7   r8   unknown_decl0  s   
z$BeautifulSoupHTMLParser.unknown_declc                 C  s0   | j   | j | | | | j t dS )z\Handle a processing instruction.

        :param data: The text of the instruction.
        N)r,   rn   rW   _document_might_be_xmlr   rX   r7   r7   r8   	handle_pi?  s   

z!BeautifulSoupHTMLParser.handle_piN)r,   r   r-   r   r+   r.   r/   r   )r9   r&   r:   r;   )r=   r&   r>   r?   r:   r;   )T)r=   r&   r>   r?   r@   rD   r:   r;   )r=   r&   rH   rD   r:   r;   )rV   r&   r:   r;   )r=   r&   r:   r;   )__name__
__module____qualname__r'   __annotations__r)   r3   r<   rC   rA   rB   rW   ri   rm   ro   rq   rt   rv   r7   r7   r7   r8   r$   =   s*   
 

>


(

	

r$   c                      s   e Zd ZU dZdZded< dZded< eZded< ee	e
gZd	ed
< ded< dZded< 		d$d% fddZ			d&d'dd Zd(d"d#Z  ZS ))r   zA Beautiful soup `bs4.builder.TreeBuilder` that uses the
    :py:class:`html.parser.HTMLParser` parser, found in the Python
    standard library.

    FrD   is_xmlT	picklabler&   NAMEzIterable[str]featuresz$Tuple[Iterable[Any], Dict[str, Any]]parser_argsTRACKS_LINE_NUMBERSNOptional[Iterable[Any]]parser_kwargsOptional[Dict[str, Any]]r/   r   c                   sp   t  }dD ]}||v r||}|||< qtt| jdi | |p#g }|p'i }|| d|d< ||f| _dS )a  Constructor.

        :param parser_args: Positional arguments to pass into
            the BeautifulSoupHTMLParser constructor, once it's
            invoked.
        :param parser_kwargs: Keyword arguments to pass into
            the BeautifulSoupHTMLParser constructor, once it's
            invoked.
        :param kwargs: Keyword arguments for the superclass constructor.
        r*   Fconvert_charrefsNr7   )dictpopsuperr   r3   updater   )r6   r   r   r/   extra_parser_kwargsargrR   	__class__r7   r8   r3   [  s   

zHTMLParserTreeBuilder.__init__markupr#   user_specified_encodingOptional[_Encoding]document_declared_encodingexclude_encodingsOptional[_Encodings]r:   DIterable[Tuple[str, Optional[_Encoding], Optional[_Encoding], bool]]c                 c  s    t |tr|dddfV  dS g }|r|| g }|r!|| t|||d|d}|jdu r3td|j|j|j|jfV  dS )a2  Run any preliminary steps necessary to make incoming markup
        acceptable to the parser.

        :param markup: Some markup -- probably a bytestring.
        :param user_specified_encoding: The user asked to try this encoding.
        :param document_declared_encoding: The markup itself claims to be
            in this encoding.
        :param exclude_encodings: The user asked _not_ to try any of
            these encodings.

        :yield: A series of 4-tuples: (markup, encoding, declared encoding,
             has undergone character replacement)

            Each 4-tuple represents a strategy for parsing the document.
            This TreeBuilder uses Unicode, Dammit to convert the markup
            into Unicode, so the ``markup`` element of the tuple will
            always be a string.
        NFT)known_definite_encodingsuser_encodingsis_htmlr   zPCould not convert input to Unicode, and html.parser will not accept bytestrings.)	
isinstancer&   rM   r   unicode_markupr   r`   declared_html_encodingcontains_replacement_characters)r6   r   r   r   r   r   r   dammitr7   r7   r8   prepare_markupy  s4   




z$HTMLParserTreeBuilder.prepare_markupr;   c              
   C  s   | j \}}t|tsJ | jd usJ t| jg|R i |}z|| |  W n ty: } zt|d }~ww g |_	d S r0   )
r   r   r&   r,   r$   feedcloseAssertionErrorr   r4   )r6   r   r-   r/   parserer7   r7   r8   r     s   


zHTMLParserTreeBuilder.feed)NN)r   r   r   r   r/   r   )NNN)
r   r#   r   r   r   r   r   r   r:   r   )r   r#   r:   r;   )rw   rx   ry   __doc__r{   rz   r|   
HTMLPARSERr}   r   r   r~   r   r3   r   r   __classcell__r7   r7   r   r8   r   J  s    
 !H)0r   
__future__r   __license____all__html.parserr   typingr   r   r   r	   r
   r   r   r   r   r   r   bs4.elementr   r   r   r   r   r   
bs4.dammitr   r   bs4.builderr   r   r   r   bs4.exceptionsr   bs4r   r    bs4._typingr!   r"   r#   r   r&   rI   r$   r   r7   r7   r7   r8   <module>   s*   4   