
    !hz'                       S r SSKJr  SSKrSSKrSSKrSSKJrJr  SSK	J
r
  SSKJrJrJrJr  SSKJrJr  SSKJr  SS	KJr  SS
KJr  SSKJrJr  SSKJr  SSKJrJ r J!r!  SSK"J#r#J$r$  SSK%J&r'  SSK(J)r)  SSK*J+r+J,r,  \(       a  SSK-J.r.  SSK/J0r0  SSK1J2r2  \Rf                  " \45      r5Sr6\Rn                  " S5      r8SS jr9SS jr:S S jr; " S S5      r<\\=\R|                  \=   4   r?\\?\\?   4   r@ " S S5      rAg)!z#
Link extractor based on lxml.html
    )annotationsN)CallableIterable)partial)TYPE_CHECKINGAnyUnioncast)urljoinurlparse)etree)HTMLTranslator)strip_html5_whitespace)canonicalize_urlsafe_url_string)Link)IGNORED_EXTENSIONS_is_valid_url_matches)arg_to_iterrel_has_nofollow)unique)get_base_url)url_has_any_extensionurl_is_from_any_domain)HtmlElement)Selector)TextResponsezhttp://www.w3.org/1999/xhtmlzstring()c                    [        U [        5      (       a:  U S   S:X  a1  U S[        [        5      S-    [        :X  a  U R	                  S5      S   $ U $ )Nr   {   })
isinstancestrlenXHTML_NAMESPACEsplit)tags    e/root/1688_scrapy/alibaba-scraper/venv/lib/python3.13/site-packages/scrapy/linkextractors/lxmlhtml.py_nonsr+   *   sK    3FcMC(1,-@yy~b!!J    c                    U $ N )xs    r*   	_identityr1   4   s    Hr,   c                *    [        U R                  SS9$ )NT)keep_fragments)r   url)links    r*   _canonicalize_link_urlr6   8   s    DHHT::r,   c                      \ rS rSr      S
           SS jjr    SS jr          SS jrSS jrSS jrSS jr	S	r
g)LxmlParserLinkExtractor<   Nc                   [        U5      (       a  UO5[        [        [        /[        4   [        [        R                  U5      5      U l        [        U5      (       a  UO5[        [        [        /[        4   [        [        R                  U5      5      U l	        [        U5      (       a  UO[        U l        X@l        XPl        U(       a8  [        [        [        /[        4   [        R                  " S5      5      U l        g [         U l        g )Nr4   )callabler
   r   r%   boolr   operatoreqscan_tag	scan_attrr1   process_attrr   stripr   
attrgetterr6   link_key)selfr)   attrprocessr   rB   canonicalizeds          r*   __init__ LxmlParserLinkExtractor.__init__=   s     }} hud{+WX[[#-FG 	 ~~ hud{+WX[[$-GH 	  ((Gi 	 # 
  4&#+&(;(;E(BC 	 ( 	r,   c              #    #    UR                  [        R                  5       Hc  nU R                  [	        UR
                  5      5      (       d  M.  UR                  nU H#  nU R                  U5      (       d  M  X$X4   4v   M%     Me     g 7fr.   )iterr   Elementr?   r+   r)   attribr@   )rE   documentelattribsrN   s        r*   _iter_links#LxmlParserLinkExtractor._iter_links\   sj      --.B==rvv//iiG!~~f--'/11 "	 /s   BB
c           
        / nU R                  UR                  5       H  u  pgn U R                  (       a  [        U5      n[	        XH5      nU R                  U5      n	U	c  MD    [        XS9n	[	        X)5      n	[        U	[        U5      =(       d    S[        UR                  S5      5      S9n
UR                  U
5        M     U R                  U5      $ ! [         a     M  f = f! [         a    [        R                  SU	< 35         M  f = f)N)encodingz)Skipping extraction of link with bad URL  rel)nofollow)rR   rootrB   r   r   rA   
ValueErrorr   loggerdebugr   _collect_string_contentr   getappend_deduplicate_if_needed)rE   selectorresponse_urlresponse_encodingbase_urllinksrP   rF   attr_valr4   r5   s              r*   _extract_links&LxmlParserLinkExtractor._extract_linksh   s    "&"2"28=="ABh	::5h?H"86 ''1; %cF ,,C'+1r)"&&-8D
 LL3 #B4 **511)    HPQs#   'C$	C$
C! C!$#D
Dc                |    [        U5      nU R                  UR                  UR                  UR                  U5      $ r.   )r   rg   ra   r4   rU   )rE   responserd   s      r*   extract_links%LxmlParserLinkExtractor.extract_links   s8    )""x||X->->
 	
r,   c                $    U R                  U5      $ )zSNormalize and filter extracted links

The subclass should override it if necessary
)r`   rE   re   s     r*   _process_links&LxmlParserLinkExtractor._process_links   s    
 **511r,   c                N    U R                   (       a  [        XR                  S9$ U$ )Nkey)r   unique_listrD   rn   s     r*   r`   .LxmlParserLinkExtractor._deduplicate_if_needed   s    ;;u--88r,   )rD   rA   r@   r?   rB   r   )ahrefNFTF)r)   str | Callable[[str], bool]rF   rx   rG   Callable[[Any], Any] | Noner   r<   rB   r<   rH   r<   )rO   r   returnz&Iterable[tuple[HtmlElement, str, str]])
ra   r   rb   r%   rc   r%   rd   r%   rz   
list[Link]rj   r   rz   r{   re   r{   rz   r{   )__name__
__module____qualname____firstlineno__rI   rR   rg   rk   ro   r`   __static_attributes__r/   r,   r*   r8   r8   <   s     ,/,2/3#
(
 *
 -	

 
 
 
>
2#
2	/
2#2#2 #2 	#2
 #2 
#2J
2r,   r8   c                      \ rS rSr\" 5       r              S                           SS jjr\SS j5       rSS jr	SS jr
SS jrSS jrSS	 jrS
rg)LxmlLinkExtractor   Nc           
        [        [        U5      5      [        [        U5      5      pv[        [        [        R
                  U5      [        [        R
                  U5      U	U
UU(       + S9U l        U R                  U5      U l        U R                  U5      U l	        [        [        U5      5      U l
        [        [        U5      5      U l        [        [        U5      5      U l        U =R                  [        [        U R                  R                   [        U5      5      5      -  sl        Uc  ["        nXl        [        U5       Vs1 s H  nSU-   iM
     snU l        U R                  U5      U l        g s  snf )N)r)   rF   r   rG   rB   rH   .)setr   r8   r   r=   containslink_extractor_compile_regexes	allow_resdeny_resallow_domainsdeny_domainstuplerestrict_xpathsmap_csstranslatorcss_to_xpathr   canonicalizedeny_extensionsrestrict_text)rE   allowdenyr   r   r   tagsattrsr   r   process_valuer   restrict_cssrB   r   es                   r*   rI   LxmlLinkExtractor.__init__   s8   " +d+,c+e2D.Ee5))40**E2!**
 150E0Ee0L/3/D/DT/J'*;}+E'F&)+l*C&D05k/6R0S##00+l2KL!
 	
 "0O".;F;W)X;Wa#';W)X484I4I-4X *Ys   E6c                    [        U 5       Vs/ s H:  n[        U[        R                  5      (       a  UO[        R                  " U5      PM<     sn$ s  snf r.   )r   r$   rePatterncompile)valuer0   s     r*   r   "LxmlLinkExtractor._compile_regexes   sK     !'
' Arzz**A

1='
 	
 
s   AAc                   [        UR                  5      (       d  gU R                  (       a&  [        UR                  U R                  5      (       d  gU R                  (       a&  [        UR                  U R                  5      (       a  g[        UR                  5      nU R                  (       a  [        X R                  5      (       d  gU R                  (       a  [        X R                  5      (       a  gU R                  (       a  [        X R                  5      (       a  gU R                  (       + =(       d     [        UR                  U R                  5      $ )NF)r   r4   r   r   r   r   r   r   r   r   r   r   text)rE   r5   
parsed_urls      r*   _link_allowedLxmlLinkExtractor._link_allowed   s    TXX&&>>(488T^^"D"D==Xdhh>>dhh'
&<**'
 '
 !7
DUDU!V!V$9,,%
 %
 %%%P$))T=O=O)PPr,   c                  ^ U R                   (       a  [        TU R                   5      (       d  gU R                  (       a  [        TU R                  5      (       a  gU R                  (       a  U4S jU R                   5       OS/nU R                  (       a  U4S jU R                   5       O/ n[        U5      =(       a    [        U5      (       + $ )NFc              3  D   >#    U  H  oR                  T5      v   M     g 7fr.   search.0regexr4   s     r*   	<genexpr>,LxmlLinkExtractor.matches.<locals>.<genexpr>   s     ;N5\\#N    Tc              3  D   >#    U  H  oR                  T5      v   M     g 7fr.   r   r   s     r*   r   r      s     ?,,s##r   )r   r   r   r   r   any)rE   r4   alloweddenieds    `  r*   matchesLxmlLinkExtractor.matches   s    &<S$BTBT&U&U!7T=N=N!O!O ~~ <DNN; 	
 DH==??VX7|/CK/r,   c                    U Vs/ s H  o R                  U5      (       d  M  UPM     nnU R                  (       a#  U H  n[        UR                  5      Ul        M     U R                  R                  U5      $ s  snf r.   )r   r   r   r4   r   ro   )rE   re   r0   r5   s       r*   ro    LxmlLinkExtractor._process_links   s`    !;Eq%7%7%:E;+DHH5 ""11%88	 <s
   A9A9c                :    U R                   R                  " U0 UD6$ r.   )r   rg   )rE   argskwargss      r*   rg    LxmlLinkExtractor._extract_links  s    ""114B6BBr,   c                   [        U5      nU R                  (       a6  U R                   VVs/ s H  o1R                  U5        H  oDPM     M     nnnOUR                  /n/ nU HJ  nU R	                  XqR
                  UR                  U5      nUR                  U R                  U5      5        ML     U R                  R                  (       a  [        X`R                  R                  S9$ U$ s  snnf )aF  Returns a list of :class:`~scrapy.link.Link` objects from the
specified :class:`response <scrapy.http.Response>`.

Only links that match the settings passed to the ``__init__`` method of
the link extractor are returned.

Duplicate links are omitted if the ``unique`` attribute is set to ``True``,
otherwise they are returned.
rr   )r   r   xpathra   rg   r4   rU   extendro   r   r   rt   rD   )	rE   rj   rd   r0   subdocdocs	all_linksdocre   s	            r*   rk   LxmlLinkExtractor.extract_links  s      ) $ 4 4 41>>RSCTCT 4  D %%&D	C''\\8;L;LhWET0078  %%y.A.A.J.JKKs   #C,)	r   r   r   r   r   r   r   r   r   )r/   r/   r/   r/   r/   )rv   area)rw   FTNNr/   TN)r   _RegexOrSeveralTr   r   r   str | Iterable[str]r   r   r   r   r   r   r   r   r   r<   r   r<   r   ry   r   zstr | Iterable[str] | Noner   r   rB   r<   r   _RegexOrSeveralT | None)r   r   rz   zlist[re.Pattern[str]])r5   r   rz   r<   )r4   r%   rz   r<   r}   )r   r   r   r   rz   r{   r|   )r~   r   r   r   r   r   rI   staticmethodr   r   r   ro   rg   rk   r   r/   r,   r*   r   r      s   #%N #%!#-/,./1$1%."596:,.15)Y)Y )Y +	)Y
 *)Y -)Y ")Y #)Y )Y )Y 3)Y 4)Y *)Y )Y /)YV 
 
Q(09Cr,   r   )r)   r   rz   r   )r0   r   rz   r   )r5   r   rz   r%   )B__doc__
__future__r   loggingr=   r   collections.abcr   r   	functoolsr   typingr   r   r	   r
   urllib.parser   r   lxmlr   parsel.csstranslatorr   
w3lib.htmlr   	w3lib.urlr   r   scrapy.linkr   scrapy.linkextractorsr   r   r   scrapy.utils.miscr   r   scrapy.utils.pythonr   rt   scrapy.utils.responser   scrapy.utils.urlr   r   	lxml.htmlr   scrapyr   scrapy.httpr   	getLoggerr~   r[   r'   XPathr]   r+   r1   r6   r8   r%   r   _RegexTr   r   r/   r,   r*   <module>r      s    #   	 .  2 2 *  / - 7  M M ; 5 . J%( 
		8	$ 1++j1 ;a aH RZZ_$
%(7"334 x xr,   