
    !hi                       S r SSKJr  SSKrSSKrSSKrSSKJrJ	r	  SSK
JrJr  SSKJr  SSKrSSKrSSKJrJr  SS	KJrJrJr  SS
KJr  \R4                  R7                  S5      rSr\" SS9 " S S5      5       r " S S5      r\" 5       r  " S S5      r!\" \ RD                  5        S       SS jj5       r#\" \ RH                  5      S 5       r$ " S S5      r%SS jr&g)ai  `tldextract` accurately separates a URL's subdomain, domain, and public suffix.

It does this via the Public Suffix List (PSL).

    >>> import tldextract

    >>> tldextract.extract("http://forums.news.cnn.com/")
    ExtractResult(subdomain='forums.news', domain='cnn', suffix='com', is_private=False)

    >>> tldextract.extract("http://forums.bbc.co.uk/")  # United Kingdom
    ExtractResult(subdomain='forums', domain='bbc', suffix='co.uk', is_private=False)

    >>> tldextract.extract("http://www.worldbank.org.kg/")  # Kyrgyzstan
    ExtractResult(subdomain='www', domain='worldbank', suffix='org.kg', is_private=False)

Note subdomain and suffix are _optional_. Not all URL-like inputs have a
subdomain or a valid suffix.

    >>> tldextract.extract("google.com")
    ExtractResult(subdomain='', domain='google', suffix='com', is_private=False)

    >>> tldextract.extract("google.notavalidsuffix")
    ExtractResult(subdomain='google', domain='notavalidsuffix', suffix='', is_private=False)

    >>> tldextract.extract("http://127.0.0.1:8080/deployed/")
    ExtractResult(subdomain='', domain='127.0.0.1', suffix='', is_private=False)

To rejoin the original hostname, if it was indeed a valid, registered hostname:

    >>> ext = tldextract.extract("http://forums.bbc.co.uk")
    >>> ext.top_domain_under_public_suffix
    'bbc.co.uk'
    >>> ext.fqdn
    'forums.bbc.co.uk'
    )annotationsN)
CollectionSequence)	dataclassfield)wraps   )	DiskCacheget_cache_dir)lenient_netloclooks_like_iplooks_like_ipv6)get_suffix_listsTLDEXTRACT_CACHE_TIMEOUT)z4https://publicsuffix.org/list/public_suffix_list.datzQhttps://raw.githubusercontent.com/publicsuffix/list/master/public_suffix_list.datT)orderc                      \ rS rSr% SrS\S'    S\S'    S\S'    S\S'    \" S	S
9rS\S'    \SS j5       r	\SS j5       r
\SS j5       r\SS j5       r\SS j5       r\SS j5       r\SS j5       rSrg)ExtractResult=   a  A URL's extracted subdomain, domain, and suffix.

These first 3 fields are what most users of this library will care about.
They are the split, non-overlapping hostname components of the input URL.
They can be used to rebuild the original URL's hostname.

Beyond the first 3 fields, the class contains metadata fields, like a flag
that indicates if the input URL's suffix is from a private domain.
str	subdomaindomainsuffixbool
is_privateF)reprregistry_suffixc                    U R                   (       a[  U R                  (       d  U R                  (       a9  SR                  S U R                  U R                  U R                   4 5       5      $ g)zThe Fully Qualified Domain Name (FQDN), if there is a proper `domain` and `suffix`, or else the empty string.

>>> extract("http://forums.bbc.co.uk/path/to/file").fqdn
'forums.bbc.co.uk'
>>> extract("http://localhost:8080").fqdn
''
.c              3  6   #    U  H  o(       d  M  Uv   M     g 7fN ).0is     \/root/1688_scrapy/alibaba-scraper/venv/lib/python3.13/site-packages/tldextract/tldextract.py	<genexpr>%ExtractResult.fqdn.<locals>.<genexpr>y   s     W'Q!UVAA'Qs   
	 )r   r   r   joinr   selfs    r$   fqdnExtractResult.fqdno   sB     ;;DKK4??88WT[['QWWW    c                    U R                   (       aH  U R                  (       d7  U R                  (       d&  [        U R                   5      (       a  U R                   $ g)zThe IPv4 address, if that is what the input domain/URL was, or else the empty string.

>>> extract("http://127.0.0.1/path/to/file").ipv4
'127.0.0.1'
>>> extract("http://127.0.0.1.1/path/to/file").ipv4
''
>>> extract("http://256.1.1.1").ipv4
''
r'   )r   r   r   r   r)   s    r$   ipv4ExtractResult.ipv4|   s4     KK[[DNNdkk**;;r-   c                   Sn[        U R                  5      U:  ai  U R                  S   S:X  aV  U R                  S   S:X  aC  U R                  (       d2  U R                  (       d!  U R                  SS n[	        U5      (       a  U$ g)a  The IPv6 address, if that is what the input domain/URL was, or else the empty string.

>>> extract(
...     "http://[aBcD:ef01:2345:6789:aBcD:ef01:127.0.0.1]/path/to/file"
... ).ipv6
'aBcD:ef01:2345:6789:aBcD:ef01:127.0.0.1'
>>> extract(
...     "http://[aBcD:ef01:2345:6789:aBcD:ef01:127.0.0.1.1]/path/to/file"
... ).ipv6
''
>>> extract("http://[aBcD:ef01:2345:6789:aBcD:ef01:256.0.0.1]").ipv6
''
   r   []r	   r'   )lenr   r   r   r   )r*   min_num_ipv6_charsdebracketeds      r$   ipv6ExtractResult.ipv6   sm      22A#%B3&[[DNN++a+K{++""r-   c                N    [         R                  " S[        SS9  U R                  $ )a  The `domain` and `suffix` fields joined with a dot, if they're both set, or else the empty string.

>>> extract("http://forums.bbc.co.uk").registered_domain
'bbc.co.uk'
>>> extract("http://localhost:8080").registered_domain
''

.. deprecated:: 6.0.0
   This property is deprecated and will be removed in the next major
   version. Use `top_domain_under_public_suffix` instead, which has the
   same behavior but a more accurate name.

This is an alias for the `top_domain_under_public_suffix` property.
`registered_domain` is so called because is roughly the domain the
owner paid to register with a registrar or, in the case of a private
domain, "registered" with the domain owner. If the input was not
something one could register, this property returns the empty string.

To distinguish the case of private domains, consider Blogspot, which is
in the PSL's private domains. If `include_psl_private_domains` was set
to `False`, the `registered_domain` property of a Blogspot URL
represents the domain the owner of Blogspot registered with a
registrar, i.e. Google registered "blogspot.com". If
`include_psl_private_domains=True`, the `registered_domain` property
represents the "blogspot.com" _subdomain_ the owner of a blog
"registered" with Blogspot.

>>> extract(
...     "http://waiterrant.blogspot.com", include_psl_private_domains=False
... ).registered_domain
'blogspot.com'
>>> extract(
...     "http://waiterrant.blogspot.com", include_psl_private_domains=True
... ).registered_domain
'waiterrant.blogspot.com'

To always get the same joined string, regardless of the
`include_psl_private_domains` setting, consider the
`top_domain_under_registry_suffix` property.
zThe 'registered_domain' property is deprecated and will be removed in the next major version. Use 'top_domain_under_public_suffix' instead, which has the same behavior but a more accurate name.   )
stacklevel)warningswarnDeprecationWarningtop_domain_under_public_suffixr)   s    r$   registered_domainExtractResult.registered_domain   s+    T 	r		
 222r-   c                    U R                   U R                  /nU R                  (       a3  UR                  [	        U R                  R                  S5      5      5        SR                  U5      $ )a  The domain name in Reverse Domain Name Notation.

Joins extracted components of the input URL in reverse domain name
notation. The suffix is used as the leftmost component, followed by the
domain, then followed by the subdomain with its parts reversed.

Reverse Domain Name Notation is typically used to organize namespaces
for packages and plugins. Technically, a full reversal would reverse
the parts of the suffix, e.g. "co.uk" would become "uk.co", but this is
not done in practice when Reverse Domain Name Notation is called for.
So this property leaves the `suffix` part in its original order.

>>> extract("login.example.com").reverse_domain_name
'com.example.login'

>>> extract("login.example.co.uk").reverse_domain_name
'co.uk.example.login'
r   )r   r   r   extendreversedsplitr(   )r*   stacks     r$   reverse_domain_name!ExtractResult.reverse_domain_name   sK    ( dkk*>>LL$.."6"6s";<=xxr-   c                    U R                   nU(       a  U R                  (       d  U$ U R                  R                  S5      S-   nSR	                  UR                  S5      U* S 5      $ )a  The rightmost domain label and `registry_suffix` joined with a dot, if such a domain is available and `registry_suffix` is set, or else the empty string.

The rightmost domain label might be in the `domain` field, or, if the
input URL's suffix is a PSL private domain, in the public suffix
`suffix` field.

If the input was not in the PSL's private domains, this property is
equivalent to `top_domain_under_public_suffix`.

>>> extract(
...     "http://waiterrant.blogspot.com", include_psl_private_domains=False
... ).top_domain_under_registry_suffix
'blogspot.com'
>>> extract(
...     "http://waiterrant.blogspot.com", include_psl_private_domains=True
... ).top_domain_under_registry_suffix
'blogspot.com'
>>> extract("http://localhost:8080").top_domain_under_registry_suffix
''
r   r<   N)rA   r   r   countr(   rG   )r*   rA   
num_labelss      r$    top_domain_under_registry_suffix.ExtractResult.top_domain_under_registry_suffix   s^    , *.)L)L&-T__11))//4q8
xx6<<SA:+,OPPr-   c                ~    U R                   (       a,  U R                  (       a  U R                   SU R                    3$ g)zThe `domain` and `suffix` fields joined with a dot, if they're both set, or else the empty string.

>>> extract("http://forums.bbc.co.uk").top_domain_under_public_suffix
'bbc.co.uk'
>>> extract("http://localhost:8080").top_domain_under_public_suffix
''
r   r'   )r   r   r)   s    r$   rA   ,ExtractResult.top_domain_under_public_suffix  s-     ;;4;;kk]!DKK=11r-   r!   N)returnr   )__name__
__module____qualname____firstlineno____doc____annotations__r   r   propertyr+   r/   r9   rB   rI   rN   rA   __static_attributes__r!   r-   r$   r   r   =   s     N|K K  !e,OS, 
 
  $  4 /3 /3b  0 Q Q8 
 
r-   r   c                     \ rS rSrSr\" 5       \SSS\4             SS jjr  S       SS jjr	  S       SS	 jjr
  S       SS
 jjr S       SS jjr S     SS jjr\SSS jj5       r S   SS jjrSrg)
TLDExtracti   zOA callable for extracting, subdomain, domain, and suffix components from a URL.TFr!   c                P   U=(       d    Sn[        S U 5       5      U l        X0l        U R                  (       d#  U(       d  U R                  (       d  [        S5      eX@l        XPl        SU l        [        U[        5      (       a  [        U5      OUU l
        [        U5      U l        g)a  Construct a callable for extracting subdomain, domain, and suffix components from a URL.

Upon calling it, it first checks for a JSON in `cache_dir`. By default,
the `cache_dir` will live in the tldextract directory. You can disable
the caching functionality of this module by setting `cache_dir` to `None`.

If the cached version does not exist, such as on the first run, HTTP
request the URLs in `suffix_list_urls` in order, and use the first
successful response for public suffix definitions. Subsequent, untried
URLs are ignored. The default URLs are the latest version of the
Mozilla Public Suffix List and its mirror, but any similar document URL
could be specified. Local files can be specified by using the `file://`
protocol (see `urllib2` documentation). To disable HTTP requests, set
this to an empty sequence.

If there is no cached version loaded and no data is found from the `suffix_list_urls`,
the module will fall back to the included TLD set snapshot. If you do not want
this behavior, you may set `fallback_to_snapshot` to False, and an exception will be
raised instead.

The Public Suffix List includes a list of "private domains" as TLDs,
such as blogspot.com. These do not fit `tldextract`'s definition of a
suffix, so these domains are excluded by default. If you'd like them
included instead, set `include_psl_private_domains` to True.

You can specify additional suffixes in the `extra_suffixes` argument.
These will be merged into whatever public suffix definitions are
already in use by `tldextract`, above.

cache_fetch_timeout is passed unmodified to the underlying request object
per the requests documentation here:
http://docs.python-requests.org/en/master/user/advanced/#timeouts

cache_fetch_timeout can also be set to a single value with the
environment variable TLDEXTRACT_CACHE_TIMEOUT, like so:

TLDEXTRACT_CACHE_TIMEOUT="1.2"

When set this way, the same timeout value will be used for both connect
and read timeouts
r!   c              3  n   #    U  H+  oR                  5       (       d  M  UR                  5       v   M-     g 7fr    )strip)r"   urls     r$   r%   &TLDExtract.__init__.<locals>.<genexpr>W  s$      &
#3Cyy{KCIIKK#3s   55zThe arguments you have provided disable all ways for tldextract to obtain data. Please provide a suffix list data, a cache_dir, or set `fallback_to_snapshot` to `True`.N)tuplesuffix_list_urlsfallback_to_snapshot
ValueErrorinclude_psl_private_domainsextra_suffixes
_extractor
isinstancer   floatcache_fetch_timeoutr
   _cache)r*   	cache_dirrc   rd   rf   rg   rk   s          r$   __init__TLDExtract.__init__$  s    d ,1r % &
#3&
 !
 %9!%%d6O6O;  ,G(,@D -s33 %&$ 	 
  	*r-   Nc                "    U R                  XUS9$ )zAlias for `extract_str`.session)extract_strr*   r`   rf   rr   s       r$   __call__TLDExtract.__call__n  s     'RRr-   c                4    U R                  [        U5      X#S9$ )aa  Take a string URL and splits it into its subdomain, domain, and suffix components.

I.e. its effective TLD, gTLD, ccTLD, etc. components.

>>> extractor = TLDExtract()
>>> extractor.extract_str("http://forums.news.cnn.com/")
ExtractResult(subdomain='forums.news', domain='cnn', suffix='com', is_private=False)
>>> extractor.extract_str("http://forums.bbc.co.uk/")
ExtractResult(subdomain='forums', domain='bbc', suffix='co.uk', is_private=False)

Allows configuring the HTTP request via the optional `session`
parameter. For example, if you need to use a HTTP proxy. See also
`requests.Session`.

>>> import requests
>>> session = requests.Session()
>>> # customize your session here
>>> with session:
...     extractor.extract_str("http://forums.news.cnn.com/", session=session)
ExtractResult(subdomain='forums.news', domain='cnn', suffix='com', is_private=False)
rq   )_extract_netlocr   rt   s       r$   rs   TLDExtract.extract_strw  s'    6 ##3!< $ 
 	
r-   c                6    U R                  UR                  X#S9$ )a  Take the output of urllib.parse URL parsing methods and further splits the parsed URL.

Splits the parsed URL into its subdomain, domain, and suffix
components, i.e. its effective TLD, gTLD, ccTLD, etc. components.

This method is like `extract_str` but faster, as the string's domain
name has already been parsed.

>>> extractor = TLDExtract()
>>> extractor.extract_urllib(
...     urllib.parse.urlsplit("http://forums.news.cnn.com/")
... )
ExtractResult(subdomain='forums.news', domain='cnn', suffix='com', is_private=False)
>>> extractor.extract_urllib(urllib.parse.urlsplit("http://forums.bbc.co.uk/"))
ExtractResult(subdomain='forums', domain='bbc', suffix='co.uk', is_private=False)
rq   )rx   netlocrt   s       r$   extract_urllibTLDExtract.extract_urllib  s&    , ##JJ3 $ 
 	
r-   c                   UR                  SS5      R                  SS5      R                  SS5      nSn[        U5      U:  a2  US   S:X  a)  US   S	:X  a   [        US
S 5      (       a  [        SUSSSS9$ UR	                  S5      nU R                  U5      R                  XbS9nSnU(       d,  [        U5      U:X  a  [        U5      (       a  [        SUSSSS9$ U(       d"  [        SR                  US S 5      US   SSSS9$ Uu  u  pu  pU	S:  a  SR                  US U	S
-
   5      OSnU	S:  a  XiS
-
     OSnSR                  XiS  5      nU
R                  (       a  SR                  XkS  5      OUn[        UUUU
R                  US9$ )Nu   。r   u   ．u   ｡r2   r   r3   r4   r5   r	   r'   F)r   r   )rf   )r   r   r   r   r   r<   )
replacer6   r   r   rG   _get_tld_extractorsuffix_indexr   r(   r   )r*   r{   rf   rr   netloc_with_ascii_dotsr7   labelsmaybe_indexesnum_ipv4_labelspublic_suffix_indexpublic_suffix_noderegistry_suffix_indexregistry_suffix_noder   r   public_suffixr   s                    r$   rx   TLDExtract._extract_netloc  s    NN8X.WXx(WXx( 	 &'+==&q)S0&r*c1 6q <== *B5RT  (--c2//8EE F 
 F.455 *B5RT   ((6#2;/bz  "  	
5 9"
 #a' HHV51A567 	
 5H!4Ka/0QS(<!=> ",, HHV234 	
  )44+
 	
r-   c                t    SU l         U R                  R                  5         U(       a  U R                  US9  gg)z/Force fetch the latest suffix list definitions.Nrq   )rh   rl   clearr   )r*   	fetch_nowrr   s      r$   updateTLDExtract.update  s4     ##G#4 r-   c                N    [        U R                  US9R                  5       5      $ )zoThe list of TLDs used by default.

This will vary based on `include_psl_private_domains` and `extra_suffixes`.
rq   )listr   tlds)r*   rr   s     r$   r   TLDExtract.tlds  s&     D++G+<AACDDr-   c                l   U R                   (       a  U R                   $ [        U R                  U R                  U R                  U R
                  US9u  p#[        X#U R                  /5      (       d  [        S5      e[        UU[        U R                  5      U R                  S9U l         U R                   $ )zGet or compute this object's TLDExtractor.

Looks up the TLDExtractor in roughly the following order, based on the
settings passed to __init__:

1. Memoized on `self`
2. Local system _cache file
3. Remote PSL, over HTTP
4. Bundled PSL snapshot file
)cacheurlsrk   rd   rr   z)No tlds set. Cannot proceed without tlds.)public_tldsprivate_tlds
extra_tldsrf   )rh   r   rl   rc   rk   rd   anyrg   re   _PublicSuffixListTLDExtractorr   rf   )r*   rr   r   r   s       r$   r   TLDExtract._get_tld_extractor	  s     ????"$4++&& $ 8 8!%!:!:%
! Kt/B/BCDDHII7#%D//0(,(H(H	
 r-   )rl   rh   rk   rg   rd   rf   rc   )rm   z
str | Nonerc   Sequence[str]rd   r   rf   r   rg   r   rk   zstr | float | NonerR   None)NNr`   r   rf   bool | Nonerr   requests.Session | NonerR   r   )r`   z3urllib.parse.ParseResult | urllib.parse.SplitResultrf   r   rr   r   rR   r   r    )r{   r   rf   r   rr   r   rR   r   FN)r   r   rr   r   rR   r   )rr   r   rR   	list[str])rr   r   rR   r   )rS   rT   rU   rV   rW   r   PUBLIC_SUFFIX_LIST_URLSCACHE_TIMEOUTrn   ru   rs   r|   rx   r   rY   r   r   rZ   r!   r-   r$   r\   r\      s   Y
 !.*A%),1(*2?H+H+ (H+ #	H+
 &*H+ &H+ 0H+ 
H+Z 48+/	SS &1S )	S
 
S 48+/	

 &1
 )	

 

D 48+/	
@
 &1
 )	

 

< ,0	F
F
 &1F
 )	F

 
F
R KO550G5	5 E E 26!.!	&! !r-   r\   c                  l    \ rS rSrSr   S       S	S jjr\ S
     SS jj5       rSSS jjrSr	g)Triei0  z:Trie for storing eTLDs with their labels in reverse-order.Nc                <    U(       a  UO0 U l         X l        X0l        g)zTODO.N)matchesendr   )r*   r   r   r   s       r$   rn   Trie.__init__3  s     #*wr$r-   c                    [        5       nU  H  nUR                  U5        M     Uc  / nU H  nUR                  US5        M     U$ )z?Create a Trie from a list of suffixes and return its root node.T)r   
add_suffix)public_suffixesprivate_suffixes	root_noder   s       r$   createTrie.create>  sS     F	%F  ( & #!&F  . ' r-   c                    U nUR                  S5      nUR                  5         U H8  nXSR                  ;  a  [        5       UR                  U'   UR                  U   nM:     SUl        X#l        g)z+Append a suffix's labels to this Trie node.r   TN)rG   reverser   r   r   r   )r*   r   r   noder   labels         r$   r   Trie.add_suffixQ  s_    c"ELL(&*fU#<<&D 
 $r-   )r   r   r   )NFF)r   zdict[str, Trie] | Noner   r   r   r   rR   r   r    )r   zCollection[str]r   zCollection[str] | NonerR   r   F)r   r   r   r   rR   r   )
rS   rT   rU   rV   rW   rn   staticmethodr   r   rZ   r!   r-   r$   r   r   0  sx    D +/ 		%'	% 	% 		%
 
	%  48(0 
 $% %r-   r   c                    [        XUS9$ )N)rf   rr   )TLD_EXTRACTOR)r`   rf   rr   s      r$   extractr   a  s     g r-   c                 .    [         R                  " U 0 UD6$ r    )r   r   )argskwargss     r$   r   r   l  s    000r-   c                  ^    \ rS rSrSr S       S	S jjrS
SS jjr S
     SS jjrSrg)r   iq  z8Wrapper around this project's main algo for PSL lookups.c                   X@l         Xl        X l        [        X-   U-   5      U l        [        X-   5      U l        [        R                  U R
                  [        U5      5      U l        [        R                  U R
                  5      U l	        g r    )
rf   r   r   	frozensettlds_incl_privatetlds_excl_privater   r   tlds_incl_private_trietlds_excl_private_trie)r*   r   r   r   rf   s        r$   rn   &_PublicSuffixListTLDExtractor.__init__t  sv     ,G(&(!*;+E
+R!S!*;+C!D&*kk""Il$;'
# '+kk$2H2H&I#r-   Nc                ^    Uc  U R                   nU(       a  U R                  $ U R                  $ )z,Get the currently filtered list of suffixes.)rf   r   r   )r*   rf   s     r$   r   "_PublicSuffixListTLDExtractor.tlds  s9    &.*.*J*J' + ""	
 ''	
r-   c                   Uc  U R                   nU(       a  U R                  OU R                  =p4[        U5      =n=pg[	        U5       H  n[        U5      n	XR                  ;   a>  US-  nUR                  U	   nUR                  (       a  UnUR                  (       d  UnUnM[  SUR                  ;   n
U
(       a5  SU	-   UR                  ;   nU(       a  UOUS-
  UR                  S   4UU44s  $   O   U[        U5      :X  a  gXS4Xd44$ )zReturn the index of the first public suffix label, the index of the first registry suffix label, and their corresponding trie nodes.

Returns `None` if no suffix is found.
Nr	   *!)	rf   r   r   r6   rF   _decode_punycoder   r   r   )r*   splrf   r   reg_node
suffix_idxreg_idx	label_idxr   decoded_labelis_wildcardis_wildcard_exceptions               r$   r   *_PublicSuffixListTLDExtractor.suffix_index  s!    '.*.*J*J' + '',,	

 ,/s83
3Wc]E,U3M,Q	||M288!*J??#'"+-K(+m(;t||(K%!6IIMLL%   / #2 S!"W$788r-   )rf   r   r   r   r   r   r   r   )r   r   r   r   r   r   rf   r   r    )rf   r   rR   zfrozenset[str])r   r   rf   r   rR   z0tuple[tuple[int, Trie], tuple[int, Trie]] | None)	rS   rT   rU   rV   rW   rn   r   r   rZ   r!   r-   r$   r   r   q  sh    B -2JJ  J 	J
 &*J$	
 JN,9,9;F,9	9,9 ,9r-   r   c                    U R                  5       nUR                  S5      nU(       a   [        R                  " U5      $ U$ ! [        [
        4 a     U$ f = f)Nzxn--)lower
startswithidnadecodeUnicodeError
IndexError)r   loweredlooks_like_punys      r$   r   r     sY    kkmG((0O	;;w'' N j) 	N	s   A AAr   r   )r   r   rR   r   )'rW   
__future__r   osurllib.parseurllibr>   collections.abcr   r   dataclassesr   r   	functoolsr   r   requestsr   r
   r   remoter   r   r   suffix_listr   environgetr   r   r   r\   r   r   ru   r   r   r   r   r!   r-   r$   <module>r      s  "H # 	   0 (    + B B )

9:  _ _ _DJ JZ .% .%b } 05'+	!, % 	  }1 1L9 L9^r-   