
    "C              
          d dl mZ d dlmZmZ ddlmZmZ ddlm	Z	m
Z
mZmZmZmZmZmZmZmZmZmZmZmZmZ  G d d      Z G d d	e      Z G d
 de      Z G d de      Z G d de      Z G d de      Z G d de      Z G d de      Z G d de      Z dee!   dee!   de"fdZ# ed      	 d#de!de$d e"de$fd!       Z%y")$    )	lru_cache)ListOptional   )COMMON_SAFE_ASCII_CHARACTERSUNICODE_SECONDARY_RANGE_KEYWORD)is_accentuatedis_asciiis_case_variableis_cjkis_emoticon	is_hangulis_hiraganais_katakanais_latinis_punctuationis_separator	is_symbolis_thairemove_accentunicode_rangec                   N    e Zd ZdZdedefdZdeddfdZd	dZe	de
fd       Zy)
MessDetectorPluginzy
    Base abstract class used for mess detection plugins.
    All detectors MUST extend and implement given methods.
    	characterreturnc                     t         )z@
        Determine if given character should be fed in.
        NotImplementedErrorselfr   s     0platform/bq/third_party/charset_normalizer/md.pyeligiblezMessDetectorPlugin.eligible   
     "!    Nc                     t         )z
        The main routine to be executed upon character.
        Insert the logic in witch the text would be considered chaotic.
        r   r   s     r!   feedzMessDetectorPlugin.feed%   s
    
 "!r$   c                     t         )zB
        Permit to reset the plugin to the initial state.
        r   r    s    r!   resetzMessDetectorPlugin.reset,   r#   r$   c                     t         )z
        Compute the chaos ratio based on what your feed() has seen.
        Must NOT be lower than 0.; No restriction gt 0.
        r   r(   s    r!   ratiozMessDetectorPlugin.ratio2   s
     "!r$   r   N)__name__
__module____qualname____doc__strboolr"   r&   r)   propertyfloatr+    r$   r!   r   r      sM    
"# "$ ""c "d "" "u " "r$   r   c                   R    e Zd Zd	dZdedefdZdeddfdZd	dZe	de
fd       Zy)
 TooManySymbolOrPunctuationPluginr   Nc                 J    d| _         d| _        d| _        d | _        d| _        y )Nr   F)_punctuation_count_symbol_count_character_count_last_printable_char_frenzy_symbol_in_wordr(   s    r!   __init__z)TooManySymbolOrPunctuationPlugin.__init__<   s*    "# !$(!&+#r$   r   c                 "    |j                         S Nisprintabler   s     r!   r"   z)TooManySymbolOrPunctuationPlugin.eligibleD       $$&&r$   c                 8   | xj                   dz  c_         || j                  k7  ro|t        vrgt        |      r| xj                  dz  c_        || _        y |j                         du r-t        |      r"t        |      du r| xj                  dz  c_        || _        y )Nr   F   )	r;   r<   r   r   r9   isdigitr   r   r:   r   s     r!   r&   z%TooManySymbolOrPunctuationPlugin.feedG   s    " 222!==i(''1,' %.! !!#u,i(	*e3""a'"$-!r$   c                 .    d| _         d| _        d| _        y Nr   )r9   r;   r:   r(   s    r!   r)   z&TooManySymbolOrPunctuationPlugin.resetY   s    "# !r$   c                     | j                   dk(  ry| j                  | j                  z   | j                   z  }|dk\  r|S dS )Nr           g333333?)r;   r9   r:   )r    ratio_of_punctuations     r!   r+   z&TooManySymbolOrPunctuationPlugin.ratio^   sO      A% ##d&8&88!! " (<s'B#KKr$   r,   r-   r.   r/   r>   r1   r2   r"   r&   r)   r3   r4   r+   r5   r$   r!   r7   r7   ;   sP    ,'# '$ '.c .d .$
 Lu L Lr$   r7   c                   R    e Zd Zd	dZdedefdZdeddfdZd	dZe	de
fd       Zy)
TooManyAccentuatedPluginr   Nc                      d| _         d| _        y rH   r;   _accentuated_countr(   s    r!   r>   z!TooManyAccentuatedPlugin.__init__k        !"#r$   r   c                 "    |j                         S r@   )isalphar   s     r!   r"   z!TooManyAccentuatedPlugin.eligibleo   s      ""r$   c                 p    | xj                   dz  c_         t        |      r| xj                  dz  c_        y y Nr   )r;   r	   rQ   r   s     r!   r&   zTooManyAccentuatedPlugin.feedr   s1    ")$##q(# %r$   c                      d| _         d| _        y rH   rP   r(   s    r!   r)   zTooManyAccentuatedPlugin.resetx   rR   r$   c                 f    | j                   dk(  ry| j                  | j                   z  }|dk\  r|S dS )Nr   rJ   gffffff?rP   )r    ratio_of_accentuations     r!   r+   zTooManyAccentuatedPlugin.ratio|   sA      A%##d&;&;; 	 )>(E$N3Nr$   r,   rL   r5   r$   r!   rN   rN   j   sP    $## #$ #)c )d )$ Ou O Or$   rN   c                   R    e Zd Zd	dZdedefdZdeddfdZd	dZe	de
fd       Zy)
UnprintablePluginr   Nc                      d| _         d| _        y rH   )_unprintable_countr;   r(   s    r!   r>   zUnprintablePlugin.__init__   s    "# !r$   r   c                      yNTr5   r   s     r!   r"   zUnprintablePlugin.eligible       r$   c                     |j                         du r,|j                         du r|dk7  r| xj                  dz  c_        | xj                  dz  c_        y )NFr   )isspacerB   r]   r;   r   s     r!   r&   zUnprintablePlugin.feed   sN    5(%%'50V###q(#"r$   c                     d| _         y rH   )r]   r(   s    r!   r)   zUnprintablePlugin.reset   s
    "#r$   c                 Z    | j                   dk(  ry| j                  dz  | j                   z  S )Nr   rJ      )r;   r]   r(   s    r!   r+   zUnprintablePlugin.ratio   s/      A%''!+t/D/DDDr$   r,   rL   r5   r$   r!   r[   r[      sP    "# $ #c #d #$ Eu E Er$   r[   c                   R    e Zd Zd	dZdedefdZdeddfdZd	dZe	de
fd       Zy)
SuspiciousDuplicateAccentPluginr   Nc                 .    d| _         d| _        d | _        y rH   _successive_countr;   _last_latin_characterr(   s    r!   r>   z(SuspiciousDuplicateAccentPlugin.__init__   s    !" !%)"r$   r   c                 <    |j                         xr t        |      S r@   )rT   r   r   s     r!   r"   z(SuspiciousDuplicateAccentPlugin.eligible   s      ":x	'::r$   c                 ~   | xj                   dz  c_         | j                  t        |      rt        | j                        ru|j                         r/| j                  j                         r| xj                  dz  c_        t        |      t        | j                        k(  r| xj                  dz  c_        || _        y rV   )r;   rl   r	   isupperrk   r   r   s     r!   r&   z$SuspiciousDuplicateAccentPlugin.feed   s    "%%1i(^D<V<V-W$$&4+E+E+M+M+O**a/* +}..0  **a/*%."r$   c                 .    d| _         d| _        d | _        y rH   rj   r(   s    r!   r)   z%SuspiciousDuplicateAccentPlugin.reset   s    !" !%)"r$   c                 Z    | j                   dk(  ry| j                  dz  | j                   z  S )Nr   rJ   rE   )r;   rk   r(   s    r!   r+   z%SuspiciousDuplicateAccentPlugin.ratio   s/      A%&&*d.C.CCCr$   r,   rL   r5   r$   r!   rh   rh      sP    *;# ;$ ;/c /d /*
 Du D Dr$   rh   c                   R    e Zd Zd	dZdedefdZdeddfdZd	dZe	de
fd       Zy)
SuspiciousRanger   Nc                 .    d| _         d| _        d | _        y rH   )"_suspicious_successive_range_countr;   _last_printable_seenr(   s    r!   r>   zSuspiciousRange.__init__   s    23/ !$(!r$   r   c                 "    |j                         S r@   rA   r   s     r!   r"   zSuspiciousRange.eligible   rC   r$   c                 <   | xj                   dz  c_         |j                         st        |      s|t        v rd | _        y | j                  || _        y t        | j                        }t        |      }t        ||      r| xj                  dz  c_        || _        y rV   )r;   rc   r   r   rv   r    is_suspiciously_successive_rangeru   )r    r   unicode_range_aunicode_range_bs       r!   r&   zSuspiciousRange.feed   s    " i(88(,D%$$,(1D%'%%
 (	2+O_M33q83$-!r$   c                 .    d| _         d| _        d | _        y rH   )r;   ru   rv   r(   s    r!   r)   zSuspiciousRange.reset   s     !23/$(!r$   c                 j    | j                   dk(  ry| j                  dz  | j                   z  }|dk  ry|S )Nr   rJ   rE   g?)r;   ru   )r    ratio_of_suspicious_range_usages     r!   r+   zSuspiciousRange.ratio   sG      A% 33a7!!+"' +S0..r$   r,   rL   r5   r$   r!   rs   rs      sM    )
'# '$ '.c .d .2)
 /u / /r$   rs   c                   R    e Zd Zd	dZdedefdZdeddfdZd	dZe	de
fd       Zy)
SuperWeirdWordPluginr   Nc                 t    d| _         d| _        d| _        d| _        d| _        d| _        d| _        d| _        y )Nr   F )_word_count_bad_word_count_is_current_word_bad_foreign_long_watchr;   _bad_character_count_buffer_buffer_accent_countr(   s    r!   r>   zSuperWeirdWordPlugin.__init__   sA     $)!#(  !$%!$%!r$   r   c                      yr_   r5   r   s     r!   r"   zSuperWeirdWordPlugin.eligible	  r`   r$   c                 $   |j                         rdj                  | j                  |g      | _        t        |      r| xj                  dz  c_        | j
                  du rUt        |      du rHt        |      du r;t        |      du r.t        |      du r!t        |      du rt        |      du rd| _        y | j                  sy |j                         st        |      st        |      r| j                  r| xj                  dz  c_        t!        | j                        }| xj"                  |z  c_        |dk\  r| j                  |z  dkD  rd| _        |dk\  r| j
                  rd| _        | j$                  rD| xj&                  dz  c_        | xj(                  t!        | j                        z  c_        d| _        d| _        d| _        d| _        y |d	vr<|j+                         du r)t-        |      rd| _        | xj                  |z  c_        y y y y )
Nr   r   FT   g(\?   r   >   -<=>)rT   joinr   r	   r   r   r   r   r   r   r   r   rc   r   r   r   lenr;   r   r   r   rF   r   )r    r   buffer_lengths      r!   r&   zSuperWeirdWordPlugin.feed  s   77DLL)#<=DLi())Q.)((E1Y'509%.i(E1	*e3	*e3I&%/+/(||>)#<Y@Wll!-M!!]2!!d&?&?-&ORV&V,0)"t'?'?,0)(($$)$))S->>),1)',D$DL()D%11!!#u,)$(,D%LLI%L % - 2r$   c                 f    d| _         d| _        d| _        d| _        d| _        d| _        d| _        y )Nr   Fr   )r   r   r   r   r   r;   r   r(   s    r!   r)   zSuperWeirdWordPlugin.reset;  s9    $)!#(   !$%!r$   c                 T    | j                   dk  ry| j                  | j                  z  S )N
   rJ   )r   r   r;   r(   s    r!   r+   zSuperWeirdWordPlugin.ratioD  s*    r!((4+@+@@@r$   r,   rL   r5   r$   r!   r   r      sQ    
&# $ -&c -&d -&^& Au A Ar$   r   c                   V    e Zd ZdZd
dZdedefdZdeddfdZd
dZ	e
defd	       Zy)CjkInvalidStopPluginu   
    GB(Chinese) based encoding often render the stop incorrectly when the content does not fit and
    can be easily detected. Searching for the overuse of '丅' and '丄'.
    r   Nc                      d| _         d| _        y rH   _wrong_stop_count_cjk_character_countr(   s    r!   r>   zCjkInvalidStopPlugin.__init__R      !"$%!r$   r   c                      yr_   r5   r   s     r!   r"   zCjkInvalidStopPlugin.eligibleV  r`   r$   c                 z    |dv r| xj                   dz  c_         y t        |      r| xj                  dz  c_        y y )N)u   丅u   丄r   )r   r   r   r   s     r!   r&   zCjkInvalidStopPlugin.feedY  s<    &""a'")%%*% r$   c                      d| _         d| _        y rH   r   r(   s    r!   r)   zCjkInvalidStopPlugin.reset`  r   r$   c                 T    | j                   dk  ry| j                  | j                   z  S )N   rJ   )r   r   r(   s    r!   r+   zCjkInvalidStopPlugin.ratiod  s*    $$r)%%(A(AAAr$   r,   )r-   r.   r/   r0   r>   r1   r2   r"   r&   r)   r3   r4   r+   r5   r$   r!   r   r   L  sU    
&# $ +c +d +& Bu B Br$   r   c                   R    e Zd Zd	dZdedefdZdeddfdZd	dZe	de
fd       Zy)
ArchaicUpperLowerPluginr   Nc                 f    d| _         d| _        d| _        d| _        d| _        d | _        d| _        y )NFr   T)_buf_character_count_since_last_sep_successive_upper_lower_count#_successive_upper_lower_count_finalr;   _last_alpha_seen_current_ascii_onlyr(   s    r!   r>   z ArchaicUpperLowerPlugin.__init__l  s9    	/0,-.*340 ! $#' r$   r   c                      yr_   r5   r   s     r!   r"   z ArchaicUpperLowerPlugin.eligibley  r`   r$   c                 P   |j                         xr t        |      }|du }|r| j                  dkD  r| j                  dk  r?|j                         du r-| j                  du r| xj
                  | j                  z  c_        d| _        d| _        d | _        d| _        | xj                  dz  c_	        d| _        y | j                  du rt        |      du rd| _        | j                  |j                         r| j                  j                         s*|j                         rM| j                  j                         r3| j                  du r| xj                  dz  c_        d| _        nd| _        nd| _        | xj                  dz  c_	        | xj                  dz  c_        || _        y )NFr   @   r   TrE   )rT   r   r   rF   r   r   r   r   r   r;   r
   ro   islower)r    r   is_concerned	chunk_seps       r!   r&   zArchaicUpperLowerPlugin.feed|  s}    ((*J/?	/J E)	==A44:%%'50,,588668 23D.34D0$(D!DI!!Q&!'+D$##t+0Cu0L',D$  ,!!#(=(=(E(E(G!!#(=(=(E(E(G99$66!;6 %DI $DI!	",,1, )r$   c                 f    d| _         d| _        d| _        d| _        d | _        d| _        d| _        y )Nr   FT)r;   r   r   r   r   r   r   r(   s    r!   r)   zArchaicUpperLowerPlugin.reset  s9     !/0,-.*340 $	#' r$   c                 T    | j                   dk(  ry| j                  | j                   z  S )Nr   rJ   )r;   r   r(   s    r!   r+   zArchaicUpperLowerPlugin.ratio  s*      A%77$:O:OOOr$   r,   rL   r5   r$   r!   r   r   k  sQ    (# $ (*c (*d (*T( Pu P Pr$   r   rz   r{   r   c                 ^   | |y| |k(  ryd| v rd|v ryd| v sd|v ry| j                  d      |j                  d      }}|D ]  }|t        v r||v s y | dv |dv }}|s|rd| v sd|v ry|r|ryd| v sd|v rd| v sd|v ry| d	k(  s|d	k(  ryd| v sd|v s| d
v r|d
v rd| v sd|v ryd| v sd|v ryy)za
    Determine if two Unicode range seen next to each other can be considered as suspicious.
    TFLatin	Emoticons )HiraganaKatakanaCJKHangulzBasic Latin)r   r   PunctuationForms)splitr   )rz   r{   keywords_range_akeywords_range_belrange_a_jp_charsrange_b_jp_charss          r!   ry   ry     sN    /"9/)/!g&@o%)G)8)>)>*S! ' 00!!	  	
	

 	33 ' +O#u'? 0?"h/&AO#u'?m+-/O 	 E_$<3377O+}/Oo%O)Cr$   i   )maxsizedecoded_sequencemaximum_thresholddebugc                    t         j                         D cg c]	  } |        }}t        |       }d}|dk  rd}n
|dk  rd}nd}t        | t	        d|            D ]o  \  }}	|D ]%  }
|
j                  |      s|
j                  |       ' |	dkD  r|	|z  dk(  s	|	|dz
  k(  sFt        |D cg c]  }|j                   c}      }||k\  so n |r'|D ]"  }t        |j                  |j                         $ t        |d	      S c c}w c c}w )
zw
    Compute a mess ratio given a decoded bytes sequence. The maximum threshold does stop the computation earlier.
    rJ   i       i   r      r   r      )r   __subclasses__r   zipranger"   r&   sumr+   print	__class__round)r   r   r   md_class	detectorslengthmean_mess_ratio!intermediary_mean_mess_ratio_calcr   indexdetectordts               r!   
mess_ratior     s(    $6#D#D#F#Fx
#F   !"FO|,.)	4,.),/) 0%62BC	5!H  +i( "
 AI%"CCqHfqj !i"@i288i"@AO"33 D B",,)  !$$A. #As   D+D
N)g?F)&	functoolsr   typingr   r   constantr   r   utilsr	   r
   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r7   rN   r[   rh   rs   r   r   r   r1   r2   ry   r4   r   r5   r$   r!   <module>r      s    ! S    &" "D,L'9 ,L^O1 O8E* E8!D&8 !DH3/( 3/lMA- MA`B- B>IP0 IPX;c];5=c];	;| 4IN'%'%.3'%BF'%
'% '%r$   