
    B                        d dl mZmZ d dlmZmZmZmZ 	 d dlm	Z	 d dlZddlmZmZmZmZ ddlmZmZmZ ddlmZ dd	lmZmZ dd
lmZmZmZmZm Z m!Z!  ejD                  d      Z#e#jI                  ejJ                          ejL                         Z'e'jQ                   ejR                  d             e#jU                  e'       	 	 	 	 	 	 	 dde+de,de,de-dee   dee   de.de.defdZ/	 	 	 	 	 	 	 ddede,de,de-dee   dee   de.de.defdZ0	 	 	 	 	 	 	 dde	de,de,de-dee   dee   de.de.defdZ1	 	 	 	 	 	 dde	de,de,de-dee   dee   de.defdZ2y# e
$ r eZ	Y @w xY w)    )basenamesplitext)BinaryIOListOptionalSet)PathLikeN   )coherence_ratioencoding_languagesmb_encoding_languagesmerge_coherence_ratios)IANA_SUPPORTEDTOO_BIG_SEQUENCETOO_SMALL_SEQUENCE)
mess_ratio)CharsetMatchCharsetMatches)any_specified_encoding	iana_nameidentify_sig_or_bomis_cp_similaris_multi_byte_encodingshould_strip_sig_or_bomcharset_normalizerz)%(asctime)s | %(levelname)s | %(message)s	sequencessteps
chunk_size	thresholdcp_isolationcp_exclusionpreemptive_behaviourexplainreturnc                    t        | t        t        f      s#t        dj	                  t        |                   |s$t        j                  t        j                         n#t        j                  t        j                         t        |       }|dk(  r/t        j                  d       t        t        | dddg d      g      S |?t        j                  d	d
j                  |             |D 	cg c]  }	t!        |	d       }}	ng }|?t        j                  dd
j                  |             |D 	cg c]  }	t!        |	d       }}	ng }|||z  k  rt        j                  d|||       d}|}|dkD  r||z  |k  rt#        ||z        }t        |       t$        k  }
t        |       t&        k\  }|
r%t        j                  dj	                  |             n&|r$t        j)                  dj	                  |             g }|du rt+        |       nd}|'|j-                  |       t        j)                  d|       t/               }g }g }d}d}d}t               }t1        |       \  }}|1|j-                  |       t        j)                  dt        |      |       |j-                  d       d|vr|j-                  d       |t2        z   D ]+  }|r||vr|r||v r||v r|j5                  |       d}||k(  }|xr t7        |      }|dv r|du rt        j)                  d|       ]	 t9        |      }	 |r9|du r5tA        |du r| dt#        d       n| t        |      t#        d       |       ntA        |du r| n| t        |      d |      }d}|D ]  }tG        ||      sd} n |rt        j                  d|       tI        |du rdn
t        |      |t#        ||z              }|xr |duxr t        |      |k  } | rt        j)                  d|       t#        t        |      dz        }!|!dk  rd}!d}"g }#g }$|D ]  }%| |%|%|z    }&|r	|du r||&z   }&|&jK                  |d      }'|rc|%dkD  r^| |%   d k\  rV|d!kD  rd!n|}(|rK|'d|( |vrDtI        |%|%dz
  d"      D ]1  })| |)|%|z    }&|r	|du r||&z   }&|&jK                  |d      }'|'d|( |v s1 n |#j-                  |'       |$j-                  tM        |'|             |$d"   |k\  r|"dz  }"|"|!k\  s|s|du s n |$rtO        |$      t        |$      z  }*nd}*|*|k\  s|"|!k\  rb|j-                  |       t        j                  d#||"tQ        |*d$z  d%&             |dd|fv r"t        | ||dg |      }+||k(  r|+}n
|dk(  r|+}n|+}t        j)                  d'|tQ        |*d$z  d%&             |stS        |      },ntU        |      },|,r.t        j)                  d(j	                  |tA        |,                   g }-|#D ]3  }'tW        |'d)|,rd*j                  |,      nd      }.|-j-                  |.       5 tY        |-      }/|/r%t        j)                  d+j	                  |/|             |j-                  t        | ||*||/|             ||ddfv r,|*d)k  r't        j)                  d,|       t        ||   g      c S ||k(  st        j)                  d-|       t        ||   g      c S  t        |      dk(  r|s|s|rt        j                  d.       |r3t        j                  d/|jZ                         |j-                  |       |S |r||r|r|j\                  |j\                  k7  s|(t        j                  d0       |j-                  |       |S |r&t        j                  d1       |j-                  |       |S c c}	w c c}	w # t:        t<        f$ r t        j?                  d|       Y 4w xY w# tB        tD        f$ rL}t        |tD              s t        j                  d|tA        |             |j-                  |       Y d}~d}~ww xY w)2aD  
    Given a raw bytes sequence, return the best possibles charset usable to render str objects.
    If there is no results, it is a strong indicator that the source is binary/not text.
    By default, the process will extract 5 blocs of 512o each to assess the mess and coherence of a given sequence.
    And will give up a particular code page after 20% of measured mess. Those criteria are customizable at will.

    The preemptive behavior DOES NOT replace the traditional detection workflow, it prioritize a particular code page
    but never take it for granted. Can improve the performance.

    You may want to focus your attention to some code page or/and not others, use cp_isolation and cp_exclusion for that
    purpose.

    This function will strip the SIG in the payload/sequence every time except on UTF-16, UTF-32.
    z4Expected object of type bytes or bytearray, got: {0}r   zXGiven content is empty, stopping the process very early, returning empty utf_8 str matchutf_8g        F Nz`cp_isolation is set. use this flag for debugging purpose. limited list of encoding allowed : %s.z, zacp_exclusion is set. use this flag for debugging purpose. limited list of encoding excluded : %s.z^override steps (%i) and chunk_size (%i) as content does not fit (%i byte(s) given) parameters.r
   z>Trying to detect encoding from a tiny portion of ({}) byte(s).zIUsing lazy str decoding because the payload is quite large, ({}) byte(s).Tz@Detected declarative mark in sequence. Priority +1 given for %s.zIDetected a SIG or BOM mark on first %i byte(s). Priority +1 given for %s.ascii>   utf_16utf_32z[Encoding %s wont be tested as-is because it require a BOM. Will try some sub-encoder LE/BE.z2Encoding %s does not provide an IncrementalDecoderg    A)encodingz9Code page %s does not fit given bytes sequence at ALL. %szW%s is deemed too similar to code page %s and was consider unsuited already. Continuing!zpCode page %s is a multi byte encoding table and it appear that at least one character was encoded using n-bytes.      ignore)errors      zc%s was excluded because of initial chaos probing. Gave up %i time(s). Computed mean chaos is %f %%.d      )ndigitsz=%s passed initial chaos probing. Mean measured chaos is %f %%z&{} should target any language(s) of {}g?,z We detected language {} using {}z0%s is most likely the one. Stopping the process.z[%s is most likely the one as we detected a BOM or SIG within the beginning of the sequence.zONothing got out of the detection process. Using ASCII/UTF-8/Specified fallback.z#%s will be used as a fallback matchz&utf_8 will be used as a fallback matchz&ascii will be used as a fallback match)/
isinstance	bytearraybytes	TypeErrorformattypeloggersetLevelloggingCRITICALINFOlenwarningr   r   joinr   intr   r   infor   appendsetr   r   addr   r   ModuleNotFoundErrorImportErrordebugstrUnicodeDecodeErrorLookupErrorr   rangedecoder   sumroundr   r   r   r   r+   fingerprint)0r   r   r   r   r    r!   r"   r#   lengthcpis_too_small_sequenceis_too_large_sequenceprioritized_encodingsspecified_encodingtestedtested_but_hard_failuretested_but_soft_failurefallback_asciifallback_u8fallback_specifiedresultssig_encodingsig_payloadencoding_ianadecoded_payloadbom_or_sig_availablestrip_sig_or_bomis_multi_byte_decoderesimilar_soft_failure_testencoding_soft_failedr_multi_byte_bonusmax_chunk_gave_upearly_stop_count	md_chunks	md_ratiosicut_sequencechunkchunk_partial_size_chkjmean_mess_ratiofallback_entrytarget_languages	cd_ratioschunk_languagescd_ratios_mergeds0                                                   1platform/bq/third_party/charset_normalizer/api.py
from_bytesr~   &   sS	   2 i)U!34BIIY
 	
 (()%^F{f	
 |IwUBPRSTUU5IIl#	

 8DD|	"e,|D6IIl#	

 8DD|	"e,|D*u$%l		
 
qyVe^j0%(
	N-??	N.>>LSS	

 
W^^	
  .BT-Iy)t  %$$%78N	

 UF  NKG 3I >L+$$\2W	
   )++$$W-.?M=M\9F"

=!+}</ 
4K5
 005IU5RKKm 	$:=$I!	$)>%)G'50 kD	*"3{#3c$i@*	 #&'50 "3{#3#56*	#  %*!$; ],@A,0) %<
 %NNi$
 %.AC4D
 " .t+.O$v- 	 KK-  B!,q  !		A$QZ8L#(8E(A*\9 ''h'GE %Q9Q<43G %r/Bz '
 $556oM"1a!eR0'0Q^'D/4D4M+6+EL , 3 3M( 3 S !8"89_L! 1 U#Zy9:}	) A%  $55$)9U)BS V !)ns9~=O!Oi'+;?P+P#**=9NN0 o+Q7 '3E FF!-}iO" !$66)7&"g-%3N"0KK/C'3	
 %1-@4]CKK8??!3'7#8 	E-s:JCHH%56PTO _-  2)<KK299$m 	$ 		
 0'7CC#%KKBM "7=#9":;;L(KKm "7=#9":;;U @X 7|q.,>NNa NN57I7R7R NN-.  N ^3"++~/I/II'NNCDNN;'
 N	 NNCDNN>*No
 E EF $[1 	LLDm 		* #K0 	a-O!F
 $**=9	s8   -_0_	 _,A_:%_76_7:a	Aaafpc           
      B    t        | j                         |||||||      S )z
    Same thing than the function from_bytes but using a file pointer that is already ready.
    Will not close the file pointer.
    )r~   read)r   r   r   r   r    r!   r"   r#   s           r}   from_fpr     s/     
		 	    pathc                 j    t        | d      5 }t        ||||||||      cddd       S # 1 sw Y   yxY w)z
    Same thing than the function from_bytes but with one extra step. Opening and reading given file path in binary mode.
    Can raise IOError.
    rbN)openr   )	r   r   r   r   r    r!   r"   r#   r   s	            r}   	from_pathr     s<     
dD	R 	
 
		s   )2c           
         t        | ||||||      }t        |       }t        t        |            }	t	        |      dk(  rt        dj                  |            |j                         }
|	dxx   d|
j                  z   z  cc<   t        dj                  t        |       j                  |dj                  |	                  d      5 }|j                  |
j                                ddd       |
S # 1 sw Y   |
S xY w)zi
    Take a (text-based) file path and try to create another file next to it, this time using UTF-8.
    r   z;Unable to normalize "{}", no encoding charset seems to fit.-z{}r'   wbN)r   r   listr   rB   IOErrorr;   bestr+   r   rM   replacerD   writeoutput)r   r   r   r   r    r!   r"   ra   filenametarget_extensionsresultr   s               r}   	normalizer     s     G ~HXh/0
7|qIPP
 	
 \\^FaC&//11	CI%%h8I0JKLd
	
!

 M

 Ms    C00C:)      皙?NNTF)r   r   r   NNT)3os.pathr   r   typingr   r   r   r   osr	   rK   rM   r?   cdr   r   r   r   constantr   r   r   mdr   modelsr   r   utilsr   r   r   r   r   r   	getLoggerr=   r>   DEBUGStreamHandlerhandlersetFormatter	Formatter
addHandlerr9   rE   floatboolr~   r   r   r    r   r}   <module>r      s   & 0 0   K J  0  
		/	0  
'


!   &W&&'RS T   ' 
 ""!%JJJ J 	J
 s)J s)J J J J^ ""!%  	
 s) s)   8 ""!%



 
 	

 s)
 s)
 
 
 
: ""!%)
)) ) 	)
 s)) s)) ) )C  Hs   E EE