
"""
Functions for dealing with markup text
"""

from __future__ import annotations

import re
from collections.abc import Iterable
from html.entities import name2codepoint
from re import Match, Pattern
from urllib.parse import urljoin

from w3lib._types import StrOrBytes
from w3lib.url import safe_url_string
from w3lib.util import to_unicode

_ent_re = re.compile(
    r"&((?P<named>[a-z\d]+)|#(?P<dec>\d+)|#x(?P<hex>[a-f\d]+))(?P<semicolon>;?)",
    re.IGNORECASE,
)
_tag_re = re.compile(r"<[a-zA-Z\/!].*?>", re.DOTALL)
_baseurl_re = re.compile(r"<base\s[^>]*href\s*=\s*[\"\']\s*([^\"\'\s]+)\s*[\"\']", re.I)
_meta_refresh_re = re.compile(
    r'<meta\s[^>]*http-equiv[^>]*refresh[^>]*'
    r'content\s*=\s*(?P<quote>["\'])(?P<int>(\d*\.)?\d+)\s*;\s*url=\s*(?P<url>.*?)(?P=quote)',
    re.DOTALL | re.IGNORECASE,
)
_meta_refresh_re2 = re.compile(
    r'<meta\s[^>]*content\s*=\s*(?P<quote>["\'])(?P<int>(\d*\.)?\d+)\s*;\s*'
    r'url=\s*(?P<url>.*?)(?P=quote)[^>]*?\shttp-equiv\s*=[^>]*refresh',
    re.DOTALL | re.IGNORECASE,
)
_cdata_re = re.compile(
    r"((?P<cdata_s><!\[CDATA\[)(?P<cdata_d>.*?)(?P<cdata_e>\]\]>))", re.DOTALL
)

HTML5_WHITESPACE = " \t\n\x0c\r"
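
# Added note (editorial, not from the original module): _meta_refresh_re and
# _meta_refresh_re2 describe the same <meta ... content="N; url=..."> refresh
# declaration and differ only in whether the http-equiv attribute appears
# before or after the content attribute; get_meta_refresh() below tries the
# first pattern and falls back to the second.
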
def replace_entities(
    text: StrOrBytes,
    keep: Iterable[str] = (),
    remove_illegal: bool = True,
    encoding: str = "utf-8",
) -> str:
    r"""Remove entities from the given `text` by converting them to their
    corresponding unicode character.

    `text` can be a unicode string or a byte string encoded in the given
    `encoding` (which defaults to 'utf-8').

    If `keep` is passed (with a list of entity names) those entities will
    be kept (they won't be removed).

    It supports both numeric entities (``&#nnnn;`` and ``&#xhhhh;``)
    and named entities (such as ``&nbsp;`` or ``&gt;``).

    If `remove_illegal` is ``True``, entities that can't be converted are removed.
    If `remove_illegal` is ``False``, entities that can't be converted are kept "as
    is". For more information see the tests.

    Always returns a unicode string (with the entities removed).

    >>> import w3lib.html
    >>> w3lib.html.replace_entities(b'Price: &pound;100')
    'Price: \xa3100'
    >>> print(w3lib.html.replace_entities(b'Price: &pound;100'))
    Price: £100
    >>>

    """

    def convert_entity(m: Match[str]) -> str:
        groups = m.groupdict()
        number = None
        if groups.get("dec"):
            number = int(groups["dec"], 10)
        elif groups.get("hex"):
            number = int(groups["hex"], 16)
        elif groups.get("named"):
            entity_name = groups["named"]
            if entity_name.lower() in keep:
                return m.group(0)
            number = name2codepoint.get(entity_name) or name2codepoint.get(
                entity_name.lower()
            )
        if number is not None:
            try:
                # Browsers typically interpret numeric character references
                # in the 0x80-0x9F range as Windows-1252 characters.
                if 0x80 <= number <= 0x9F:
                    return bytes((number,)).decode("cp1252")
                return chr(number)
            except (ValueError, OverflowError):
                pass

        return "" if remove_illegal and groups.get("semicolon") else m.group(0)

    return _ent_re.sub(convert_entity, to_unicode(text, encoding))


def has_entities(text: StrOrBytes, encoding: str | None = None) -> bool:
    return bool(_ent_re.search(to_unicode(text, encoding)))
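
# Illustrative example (editorial addition, not from the module's docstrings),
# showing has_entities() above on text with and without an HTML entity:
#
#     >>> has_entities("Price: &pound;100")
#     True
#     >>> has_entities("Price: 100")
#     False
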
def replace_tags(
    text: StrOrBytes, token: str = "", encoding: str | None = None
) -> str:
    r"""Replace all markup tags found in the given `text` by the given token.
    By default `token` is an empty string so it just removes all tags.

    `text` can be a unicode string or a regular string encoded as `encoding`
    (or ``'utf-8'`` if `encoding` is not given.)

    Always returns a unicode string.

    Examples:

    >>> import w3lib.html
    >>> w3lib.html.replace_tags('This text contains <a>some tag</a>')
    'This text contains some tag'
    >>> w3lib.html.replace_tags('<p>Je ne parle pas <b>fran\xe7ais</b></p>', ' -- ', 'latin-1')
    ' -- Je ne parle pas  -- fran\xe7ais --  -- '
    >>>

    """
    return _tag_re.sub(token, to_unicode(text, encoding))


# The trailing `|$` also strips a comment left unterminated at the end of the
# input.
_REMOVECOMMENTS_RE = re.compile("<!--.*?(?:-->|$)", re.DOTALL)


def remove_comments(text: StrOrBytes, encoding: str | None = None) -> str:
    """Remove HTML Comments.

    >>> import w3lib.html
    >>> w3lib.html.remove_comments(b"test <!--textcomment--> whatever")
    'test  whatever'
    >>>

    """
    utext = to_unicode(text, encoding)
    return _REMOVECOMMENTS_RE.sub("", utext)


def remove_tags(
    text: StrOrBytes,
    which_ones: Iterable[str] = (),
    keep: Iterable[str] = (),
    encoding: str | None = None,
) -> str:
    """Remove HTML Tags only.

    `which_ones` and `keep` are both tuples, there are four cases:

    ==============  ============= ==========================================
    ``which_ones``  ``keep``      what it does
    ==============  ============= ==========================================
    **not empty**   empty         remove all tags in ``which_ones``
    empty           **not empty** remove all tags except the ones in ``keep``
    empty           empty         remove all tags
    **not empty**   **not empty** not allowed
    ==============  ============= ==========================================


    Remove all tags:

    >>> import w3lib.html
    >>> doc = '<div><p><b>This is a link:</b> <a href="http://www.example.com">example</a></p></div>'
    >>> w3lib.html.remove_tags(doc)
    'This is a link: example'
    >>>

    Keep only some tags:

    >>> w3lib.html.remove_tags(doc, keep=('div',))
    '<div>This is a link: example</div>'
    >>>

    Remove only specific tags:

    >>> w3lib.html.remove_tags(doc, which_ones=('a','b'))
    '<div><p>This is a link: example</p></div>'
    >>>

    You can't remove some and keep some:

    >>> w3lib.html.remove_tags(doc, which_ones=('a',), keep=('p',))
    Traceback (most recent call last):
        ...
    ValueError: Cannot use both which_ones and keep
    >>>

    """
    if which_ones and keep:
        raise ValueError("Cannot use both which_ones and keep")

    which_ones = {tag.lower() for tag in which_ones}
    keep = {tag.lower() for tag in keep}

    def will_remove(tag: str) -> bool:
        tag = tag.lower()
        if which_ones:
            return tag in which_ones
        return tag not in keep

    def remove_tag(m: Match[str]) -> str:
        tag = m.group(1)
        return "" if will_remove(tag) else m.group(0)

    regex = "</?([^ >/]+).*?>"
    retags = re.compile(regex, re.DOTALL | re.IGNORECASE)

    return retags.sub(remove_tag, to_unicode(text, encoding))


def remove_tags_with_content(
    text: StrOrBytes, which_ones: Iterable[str] = (), encoding: str | None = None
) -> str:
    """Remove tags and their content.

    `which_ones` is a tuple of which tags to remove including their content.
    If it is empty, returns the string unmodified.

    >>> import w3lib.html
    >>> doc = '<div><p><b>This is a link:</b> <a href="http://www.example.com">example</a></p></div>'
    >>> w3lib.html.remove_tags_with_content(doc, which_ones=('b',))
    '<div><p> <a href="http://www.example.com">example</a></p></div>'
    >>>

    """
    utext = to_unicode(text, encoding)
    if which_ones:
        tags = "|".join([rf"<{tag}\b.*?</{tag}>|<{tag}\s*/>" for tag in which_ones])
        retags = re.compile(tags, re.DOTALL | re.IGNORECASE)
        utext = retags.sub("", utext)
    return utext


def replace_escape_chars(
    text: StrOrBytes,
    which_ones: Iterable[str] = ("\n", "\t", "\r"),
    replace_by: StrOrBytes = "",
    encoding: str | None = None,
) -> str:
    r"""Remove escape characters.

    `which_ones` is a tuple of which escape characters we want to remove.
    By default removes ``\n``, ``\t``, ``\r``.

    `replace_by` is the string to replace the escape characters by.
    It defaults to ``''``, meaning the escape characters are removed.

    """
    utext = to_unicode(text, encoding)
    for ec in which_ones:
        utext = utext.replace(ec, to_unicode(replace_by, encoding))
    return utext
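
# Illustrative example (editorial addition, not from the module's docstrings)
# for replace_escape_chars() above:
#
#     >>> replace_escape_chars('a\tb\nc')
#     'abc'
#     >>> replace_escape_chars('a\tb\nc', replace_by=' ')
#     'a b c'
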
def unquote_markup(
    text: StrOrBytes,
    keep: Iterable[str] = (),
    remove_illegal: bool = True,
    encoding: str | None = None,
) -> str:
    """
    This function receives markup as a text (always a unicode string or
    a UTF-8 encoded string) and does the following:

    1. removes entities (except the ones in `keep`) from any part of it
        that is not inside a CDATA
    2. searches for CDATAs and extracts their text (if any) without modifying it.
    3. removes the found CDATAs

    """

    def _get_fragments(
        txt: str, pattern: Pattern[str]
    ) -> Iterable[str | Match[str]]:
        offset = 0
        for match in pattern.finditer(txt):
            match_s, match_e = match.span(1)
            yield txt[offset:match_s]
            yield match
            offset = match_e
        yield txt[offset:]

    utext = to_unicode(text, encoding)
    ret_text = ""
    for fragment in _get_fragments(utext, _cdata_re):
        if isinstance(fragment, str):
            # it's not a CDATA (so we try to remove its entities)
            ret_text += replace_entities(
                fragment, keep=keep, remove_illegal=remove_illegal
            )
        else:
            # it's a CDATA (so we just extract its content)
            ret_text += fragment.group("cdata_d")
    return ret_text
def get_base_url(
    text: StrOrBytes, baseurl: StrOrBytes = "", encoding: str = "utf-8"
) -> str:
    """Return the base url if declared in the given HTML `text`,
    relative to the given base url.

    If no base url is found, the given `baseurl` is returned.

    """
    utext = remove_comments(text, encoding=encoding)
    m = _baseurl_re.search(utext)
    if m:
        return urljoin(
            safe_url_string(baseurl),
            safe_url_string(m.group(1), encoding=encoding),
        )
    return safe_url_string(baseurl)
def get_meta_refresh(
    text: StrOrBytes,
    baseurl: str = "",
    encoding: str = "utf-8",
    ignore_tags: Iterable[str] = ("script", "noscript"),
) -> tuple[None, None] | tuple[float, str]:
    """Parse the http-equiv parameter of the HTML meta element from the given
    HTML text and return a tuple ``(interval, url)`` where interval is a float
    containing the delay in seconds (or zero if not present) and url is a
    string with the absolute url to redirect.

    If no meta redirect is found, ``(None, None)`` is returned.

    """
    try:
        utext = to_unicode(text, encoding)
    except UnicodeDecodeError:
        print(text)
        raise
    utext = remove_tags_with_content(utext, ignore_tags)
    utext = remove_comments(replace_entities(utext))
    m = _meta_refresh_re.search(utext) or _meta_refresh_re2.search(utext)
    if m:
        interval = float(m.group("int"))
        url = safe_url_string(m.group("url").strip(' "\''), encoding)
        url = urljoin(baseurl, url)
        return interval, url
    return None, None
def strip_html5_whitespace(text: str) -> str:
    r"""
    Strip all leading and trailing space characters (as defined in
    https://www.w3.org/TR/html5/infrastructure.html#space-character).

    Such stripping is useful e.g. for processing HTML element attributes which
    contain URLs, like ``href``, ``src`` or form ``action`` - HTML5 standard
    defines them as "valid URL potentially surrounded by spaces"
    or "valid non-empty URL potentially surrounded by spaces".

    >>> strip_html5_whitespace(' hello\n')
    'hello'
    """
    return text.strip(HTML5_WHITESPACE)