
    !h                    :   S r SSKJr  SSKrSSKrSSKrSSKJr  SSKJ	r	J
r
  SSKJr  SSKJr  SSKJr  \R"                  " S	\5        \(       a  SS
KJr  SSKJr  SSKJrJr  SSKJr  \R6                  " \5      r " S S\5      r " S S\5      r  " S S\5      r!g)zS
Offsite Spider Middleware

See documentation in docs/topics/spider-middleware.rst
    )annotationsN)TYPE_CHECKING)Spidersignals)ScrapyDeprecationWarning)BaseSpiderMiddleware)urlparse_cachedzlThe scrapy.spidermiddlewares.offsite module is deprecated, use scrapy.downloadermiddlewares.offsite instead.)Self)Crawler)RequestResponse)StatsCollectorc                  r    \ rS rSr% S\S'   SS jr\SS j5       r      SS jrSS jr	SS jr
SS	 jrS
rg)OffsiteMiddleware%   r   crawlerc                    Xl         g N)stats)selfr   s     g/root/1688_scrapy/alibaba-scraper/venv/lib/python3.13/site-packages/scrapy/spidermiddlewares/offsite.py__init__OffsiteMiddleware.__init__(   s    %*
    c                    UR                   (       d   eU " UR                   5      nXl        UR                  R                  UR                  [        R                  S9  U$ )N)signal)r   r   r   connectspider_opened)clsr   os      r   from_crawlerOffsiteMiddleware.from_crawler+   sG    }}}	8M8MNr   c                   Uc  U$ U R                   R                  (       d   eUR                  (       dJ  UR                  R	                  S5      (       d*  U R                  XR                   R                  5      (       a  U$ [        U5      R                  nU(       a  X0R                  ;  av  U R                  R                  U5        [        R                  SX1S.SU R                   R                  0S9  U R                  R                  SU R                   R                  S9  U R                  R                  SU R                   R                  S9  g )	Nallow_offsitez3Filtered offsite request to %(domain)r: %(request)s)domainrequestspider)extrazoffsite/domains)r'   zoffsite/filtered)r   r'   dont_filtermetagetshould_followr	   hostnamedomains_seenaddloggerdebugr   	inc_value)r   r&   responser%   s       r   get_processed_request'OffsiteMiddleware.get_processed_request3   s    N||""""||00!!'<<+>+>??N )22f$5$55!!&)LLE!6!4!45  
 JJ  !24<<;N;N O

/8K8KLr   c                    U R                   n[        U5      R                  =(       d    Sn[        UR	                  U5      5      $ )N )
host_regexr	   r-   boolsearch)r   r&   r'   regexhosts        r   r,   OffsiteMiddleware.should_followL   s4    w'006BELL&''r   c                R   [        USS5      nU(       d  [        R                  " S5      $ [        R                  " S5      n[        R                  " S5      n/ nU H  nUc  M  UR                  U5      (       a#  SU S3n[        R
                  " U[        5        MA  UR                  U5      (       a#  SU S3n[        R
                  " U[        5        Mz  UR                  [        R                  " U5      5        M     S	S
R                  U5       S3n[        R                  " U5      $ )z<Override this method to implement a different offsite policyallowed_domainsNr7   z^https?://.*$z:\d+$zCallowed_domains accepts only domains, not URLs. Ignoring URL entry z in allowed_domains.zCallowed_domains accepts only domains without ports. Ignoring entry z	^(.*\.)?(|z)$)getattrrecompilematchwarningswarn
URLWarningr:   PortWarningappendescapejoin)	r   r'   r?   url_patternport_patterndomainsr%   messager;   s	            r   get_host_regex OffsiteMiddleware.get_host_regexR   s   !&*;TB::b>!jj!12zz(+%F~  ((**01EG  gz2$$V,,&&,X-AC  g{3ryy01! &" SXXg./r2zz%  r   c                N    U R                  U5      U l        [        5       U l        g r   )rP   r8   setr.   )r   r'   s     r   r   OffsiteMiddleware.spider_openedn   s    +/+>+>v+F&)er   )r.   r8   r   N)r   r   )r   r   returnr
   )r&   r   r3   zResponse | NonerU   zRequest | None)r&   r   r'   r   rU   r9   )r'   r   rU   zre.Pattern[str])r'   r   rU   None)__name__
__module____qualname____firstlineno____annotations__r   classmethodr!   r4   r,   rP   r   __static_attributes__ r   r   r   r   %   sM    +  *9	2(!8,r   r   c                      \ rS rSrSrg)rG   s   r^   NrW   rX   rY   rZ   r]   r^   r   r   rG   rG   s       r   rG   c                      \ rS rSrSrg)rH   w   r^   Nra   r^   r   r   rH   rH   w   rb   r   rH   )"__doc__
__future__r   loggingrB   rE   typingr   scrapyr   r   scrapy.exceptionsr   scrapy.spidermiddlewares.baser   scrapy.utils.httpobjr	   rF   typing_extensionsr
   scrapy.crawlerr   scrapy.httpr   r   scrapy.statscollectorsr   	getLoggerrW   r0   r   WarningrG   rH   r^   r   r   <module>rs      s    #  	    " 6 > 0 4 &&-5 
		8	$K,, K,\	 		' 	r   