
    !h                    f   S r SSKJr  SSKrSSKrSSKJr  SSKJrJ	r	  SSK
JrJrJrJr  SSKJr  SSKJr  SS	KJr  SS
KJr  SSKJr  SSKJr  SS jr\(       a  SSKJr  SSKJr  \	\ \!\4   r"SS jr#SS jr$SS jr%SS jr&SS jr'SS jr(SS jr)SS jr*SS jr+    S            S!S jjr,g)"zW
This module contains general purpose URL functions not found in the standard
library.
    )annotationsN)import_module)TYPE_CHECKINGUnion)ParseResult	urldefragurlparse
urlunparse)warn)__all__)add_or_replace_parameter)
any_to_uri)	parse_url)ScrapyDeprecationWarningc           	         U SSS/[         Q7;   aD  U S:X  a  SOSn[        R                  " SU  SU SU  S	3[        5        [	        [        S
5      U 5      $ [        e)N_unquotepath_safe_charsr   	attributefunctionzThe scrapy.utils.url. z is deprecated, use w3lib.url.z	 instead.z	w3lib.url)_public_w3lib_objectswarningsr   r   getattrr   AttributeError)nameobj_types     W/root/1688_scrapy/alibaba-scraper/venv/lib/python3.13/site-packages/scrapy/utils/url.py__getattr__r      sj    {S=RSS"&-"7;Z#D68*4RSWRXXab$	
 }[1488
    )Iterable)Spiderc                   ^ [        U 5      R                  R                  5       mT(       d  gU Vs/ s H  o"R                  5       PM     nn[        U4S jU 5       5      $ s  snf )z:Return True if the url belongs to any of the given domainsFc              3  d   >#    U  H%  nTU:H  =(       d    TR                  S U 35      v   M'     g7f).Nendswith).0dhosts     r   	<genexpr>)url_is_from_any_domain.<locals>.<genexpr>1   s,     H1	6t}}qW56s   -0)
_parse_urlnetloclowerany)urldomainsr(   r)   s      @r   url_is_from_any_domainr2   +   sO    c?!!'')D")*'Qwwy'G*HHHH +s   A!c           	     H    [        XR                  /[        US/ 5      Q5      $ )z2Return True if the url belongs to the given spiderallowed_domains)r2   r   r   )r0   spiders     r   url_is_from_spiderr6   4   s(    !kkCGF,=rBC r   c                t   ^ [        U 5      R                  R                  5       m[        U4S jU 5       5      $ )z?Return True if the url ends with one of the extensions providedc              3  F   >#    U  H  nTR                  U5      v   M     g 7fNr%   )r'   extlowercase_paths     r   r*   (url_has_any_extension.<locals>.<genexpr>>   s     Bz~&&s++zs   !)r,   pathr.   r/   )r0   
extensionsr;   s     @r   url_has_any_extensionr?   ;   s,    _))//1NBzBBBr   c                    [        S[        SS9  [        U 5      u  pUR                  S5      (       d  U $ [	        USUSS 5      $ )a(  
Return the crawlable url

>>> escape_ajax("www.example.com/ajax.html#!key=value")
'www.example.com/ajax.html?_escaped_fragment_=key%3Dvalue'
>>> escape_ajax("www.example.com/ajax.html?k1=v1&k2=v2#!key=value")
'www.example.com/ajax.html?k1=v1&k2=v2&_escaped_fragment_=key%3Dvalue'
>>> escape_ajax("www.example.com/ajax.html?#!key=value")
'www.example.com/ajax.html?_escaped_fragment_=key%3Dvalue'
>>> escape_ajax("www.example.com/ajax.html#!")
'www.example.com/ajax.html?_escaped_fragment_='

URLs that are not "AJAX crawlable" (according to Google) returned as-is:

>>> escape_ajax("www.example.com/ajax.html#key=value")
'www.example.com/ajax.html#key=value'
>>> escape_ajax("www.example.com/ajax.html#")
'www.example.com/ajax.html#'
>>> escape_ajax("www.example.com/ajax.html")
'www.example.com/ajax.html'
zKescape_ajax() is deprecated and will be removed in a future Scrapy version.   )
stacklevel!_escaped_fragment_   N)r   r   r   
startswith_add_or_replace_parameter)r0   defragfrags      r   escape_ajaxrJ   A   sL    , 	U 
 S>LF??3
$V-A48LLr   c                    [         R                  " SU [         R                  S9nU(       d$  [        U 5      nUR                  (       a  SOSnX0-   n U $ )z=Add http as the default scheme if it is missing from the url.z^\w+://flagszhttp:zhttp://)rematch
IGNORECASEr	   r-   )r0   rO   partsschemes       r   add_http_if_no_schemerS   b   s=    HHZBMM:E!LLilJr   c                \    [        [        R                  " SU [        R                  S95      $ )Na  
            ^                   # start with...
            (
                \.              # ...a single dot,
                (
                    \. | [^/\.]+  # optionally followed by
                )?                # either a second dot or some characters
                |
                ~   # $HOME
            )?      # optional match of ".", ".." or ".blabla"
            /       # at least one "/" for a file path,
            .       # and something after the "/"
            rL   )boolrN   rO   VERBOSEstrings    r   _is_posix_pathrY   m   s,    
 **	
 r   c                ~    [        [        R                  " SU [        R                  [        R                  -  S95      $ )Nzg
            ^
            (
                [a-z]:\\
                | \\\\
            )
            rL   )rU   rN   rO   rP   rV   rW   s    r   _is_windows_pathr[      s5    
 --"**,
	
 r   c                <    [        U 5      =(       d    [        U 5      $ r9   )rY   r[   rW   s    r   _is_filesystem_pathr]      s    &!=%5f%==r   c                N    [        U 5      (       a  [        U 5      $ [        U 5      $ )zSAdd an URL scheme if missing: file:// for filepath-like input or
http:// otherwise.)r]   _any_to_urirS   )r0   s    r   guess_schemer`      s%     33 %%r   c                6   [        U 5      nUR                  nU(       d  U(       a6  UR                  (       d  UR                  (       a  UR	                  S5      S   nU(       aL  UR
                  (       a;  UR                  UR
                  4S;   a  UR                  SUR
                   3S5      n[        UR                  UU(       a  SOUR                  U(       a  SOUR                  U(       a  SOUR                  U(       a  S45      $ UR                  45      $ )a|  Strip URL string from some of its components:

- ``strip_credentials`` removes "user:password@"
- ``strip_default_port`` removes ":80" (resp. ":443", ":21")
  from http:// (resp. https://, ftp://) URLs
- ``origin_only`` replaces path component with "/", also dropping
  query and fragment components ; it also strips credentials
- ``strip_fragment`` drops any #fragment component
@))httpP   )httpsi  )ftp   : /)r	   r-   usernamepasswordsplitportrR   replacer
   r=   paramsqueryfragment)r0   strip_credentialsstrip_default_portorigin_onlystrip_fragment
parsed_urlr-   s          r   	strip_urlry      s    " #JF[z22c"2& 	OO
0

 !JOO#4 5r:CJOOB:#4#4B:#3#3 B	
	 	 '1&9&9	
	 	r   )r   str)r0   UrlTr1   Iterable[str]returnrU   )r0   r{   r5   ztype[Spider]r}   rU   )r0   r{   r>   r|   r}   rU   )r0   rz   r}   rz   )rX   rz   r}   rU   )TTFT)r0   rz   rt   rU   ru   rU   rv   rU   rw   rU   r}   rz   )-__doc__
__future__r   rN   r   	importlibr   typingr   r   urllib.parser   r   r	   r
   r   	w3lib.urlr   r   r   rG   r   r_   r   r,   scrapy.exceptionsr   r   collections.abcr    scrapyr!   rz   bytesr{   r2   r6   r?   rJ   rS   rY   r[   r]   r`   ry    r   r   <module>r      s   
 # 	  # ' E E  6 K / - 6	 (S%$%ICMB, >& ##-	-- - 	-
 - 	-r   