
    !h]                        S SK Jr  S SKrS SKrS SKrS SKJr  S SKJrJ	r	J
r
  S SKJr  S SKJr  \(       a  S SKJr  S SKJr  S S	KJr  \R*                  " \5      r " S
 S5      rg)    )annotationsN)TYPE_CHECKING)RequestSpidersignals)IgnoreRequest)urlparse_cached)Self)Crawler)StatsCollectorc                  d    \ rS rSr\SS j5       rSS jrSS jrSS jrSS jr	SS jr
SS jrS	rg
)OffsiteMiddleware   c                   UR                   (       d   eU " UR                   5      nUR                  R                  UR                  [        R                  S9  UR                  R                  UR                  [        R                  S9  U$ )N)signal)statsr   connectspider_openedrequest_scheduled)clscrawleros      k/root/1688_scrapy/alibaba-scraper/venv/lib/python3.13/site-packages/scrapy/downloadermiddlewares/offsite.pyfrom_crawlerOffsiteMiddleware.from_crawler   se    }}}8M8MN 3 3G<U<UV    c                .    Xl         [        5       U l        g N)r   setdomains_seen)selfr   s     r   __init__OffsiteMiddleware.__init__    s    
&)er   c                0    U R                  U5      U l        g r   )get_host_regex
host_regex)r!   spiders     r   r   OffsiteMiddleware.spider_opened$   s    +/+>+>v+Fr   c                &    U R                  X5        g r   )process_request)r!   requestr'   s      r   r   #OffsiteMiddleware.request_scheduled'   s    W-r   c                   UR                   (       d6  UR                  R                  S5      (       d  U R                  X5      (       a  g [	        U5      R
                  nU(       a]  X0R                  ;  aN  U R                  R                  U5        [        R                  SX1S.SU0S9  U R                  R                  SUS9  U R                  R                  SUS9  [        e)	Nallow_offsitez3Filtered offsite request to %(domain)r: %(request)s)domainr+   r'   )extrazoffsite/domains)r'   zoffsite/filtered)dont_filtermetagetshould_followr	   hostnamer    addloggerdebugr   	inc_valuer   )r!   r+   r'   r/   s       r   r*   !OffsiteMiddleware.process_request*   s    ||00!!'22 )22f$5$55!!&)LLE!6(  
 JJ  !26 B

/?r   c                    U R                   n[        U5      R                  =(       d    Sn[        UR	                  U5      5      $ )N )r&   r	   r5   boolsearch)r!   r+   r'   regexhosts        r   r4   OffsiteMiddleware.should_follow=   s4    w'006BELL&''r   c                >   [        USS5      nU(       d  [        R                  " S5      $ [        R                  " S5      n[        R                  " S5      n/ nU H  nUc  M  UR                  U5      (       a  SU S3n[        R
                  " U5        M<  UR                  U5      (       a  SU S3n[        R
                  " U5        Mp  UR                  [        R                  " U5      5        M     S	S
R                  U5       S3n[        R                  " U5      $ )z<Override this method to implement a different offsite policyallowed_domainsNr<   z^https?://.*$z:\d+$zCallowed_domains accepts only domains, not URLs. Ignoring URL entry z in allowed_domains.zCallowed_domains accepts only domains without ports. Ignoring entry z	^(.*\.)?(|z)$)
getattrrecompilematchwarningswarnr>   appendescapejoin)	r!   r'   rC   url_patternport_patterndomainsr/   messager?   s	            r   r%    OffsiteMiddleware.get_host_regexC   s   !&*;TB::b>!jj!12zz(+%F~  ((**01EG  g&$$V,,&&,X-AC  g&ryy01! &" SXXg./r2zz%  r   )r    r&   r   N)r   r   returnr
   )r   r   )r'   r   rS   None)r+   r   r'   r   rS   rT   )r+   r   r'   r   rS   r=   )r'   r   rS   zre.Pattern[str])__name__
__module____qualname____firstlineno__classmethodr   r"   r   r   r*   r4   r%   __static_attributes__ r   r   r   r      s3     ,G.&(!r   r   )
__future__r   loggingrF   rI   typingr   scrapyr   r   r   scrapy.exceptionsr   scrapy.utils.httpobjr	   typing_extensionsr
   scrapy.crawlerr   scrapy.statscollectorsr   	getLoggerrU   r7   r   r[   r   r   <module>rf      sJ    "  	    + + + 0&&5 
		8	$F! F!r   