
    *4                         d dl Z d dlmZ d dlmZ d dlmZ d dlmZ d dl	m
Z
 d dlmZmZmZmZmZmZmZ dd	lmZmZ dd
lmZ ddlmZmZmZ  G d d      Z G d d      Zeeef   Z ee    Z! G d d      Z"y)    N)Counter)aliases)sha256)dumps)sub)AnyDictIteratorListOptionalTupleUnion   )NOT_PRINTABLE_PATTERNTOO_BIG_SEQUENCE)
mess_ratio)	iana_nameis_multi_byte_encodingunicode_rangec                   n   e Zd Z	 d)dededededddee   fd	Zd
e	defdZ
d
e	defdZedefd       Zedefd       Zedefd       Zedefd       ZdefdZdefdZd*dZedefd       Zedee   fd       Zedefd       Zedefd       Zedee   fd       Zedefd       Zedefd       Zedefd       Zedefd       Zedefd       Zedefd       Z eded    fd        Z!edefd!       Z"edee   fd"       Z#edee   fd#       Z$d+d$Z%d+d%Z&d,d&edefd'Z'edefd(       Z(y)-CharsetMatchNpayloadguessed_encodingmean_mess_ratiohas_sig_or_bom	languagesCoherenceMatchesdecoded_payloadc                     || _         || _        || _        || _        || _        d | _        g | _        d| _        d | _        d | _	        || _
        y )N        )_payload	_encoding_mean_mess_ratio
_languages_has_sig_or_bom_unicode_ranges_leaves_mean_coherence_ratio_output_payload_output_encoding_string)selfr   r   r   r   r   r   s          4platform/bq/third_party/charset_normalizer/models.py__init__zCharsetMatch.__init__   sW      ) /#-#%("# $&    otherreturnc                    t        |t              sAt        dj                  t	        |j
                        t	        | j
                                    | j                  |j                  k(  xr | j                  |j                  k(  S )Nz&__eq__ cannot be invoked on {} and {}.)
isinstancer   	TypeErrorformatstr	__class__encodingfingerprintr,   r0   s     r-   __eq__zCharsetMatch.__eq__)   si    %.8??(#dnn*= 
 }}.X43C3CuGXGX3XXr/   c                    t        |t              st        t        | j                  |j                  z
        }t        | j
                  |j
                  z
        }|dk  rU|dkD  rP|dk(  r2| j
                  |j
                  k(  r| j                  |j                  kD  S | j
                  |j
                  kD  S | j                  |j                  k  S )zQ
        Implemented to make sorted available upon CharsetMatches items.
        g{Gz?g{Gz?r    )r3   r   
ValueErrorabschaos	coherencemulti_byte_usage)r,   r0   chaos_differencecoherence_differences       r-   __lt__zCharsetMatch.__lt__2   s     %.tzzEKK78"4>>EOO#CD d"';d'B3&4>>U__+L,,u/E/EEE>>EOO33zzEKK''r/   c                 \    dt        t        |             t        | j                        z  z
  S )N      ?)lenr6   rawr,   s    r-   rA   zCharsetMatch.multi_byte_usageE   s"    ST^c$((m333r/   c                 `    t        j                  dt               t        t	        |       d      S )z
        Check once again chaos in decoded text, except this time, with full content.
        Use with caution, this can be very slow.
        Notice: Will be removed in 3.0
        z=chaos_secondary_pass is deprecated and will be removed in 3.0rF   )warningswarnDeprecationWarningr   r6   rI   s    r-   chaos_secondary_passz!CharsetMatch.chaos_secondary_passI   s)     	K	
 #d)S))r/   c                 8    t        j                  dt               y)zy
        Coherence ratio on the first non-latin language detected if ANY.
        Notice: Will be removed in 3.0
        z<coherence_non_latin is deprecated and will be removed in 3.0r    )rK   rL   rM   rI   s    r-   coherence_non_latinz CharsetMatch.coherence_non_latinV   s     	J	
 r/   c                     t        j                  dt               t        t        dt        |       j                               }t        |j                               S )z_
        Word counter instance on decoded text.
        Notice: Will be removed in 3.0
        z2w_counter is deprecated and will be removed in 3.0 )	rK   rL   rM   r   r   r6   lowerr   split)r,   string_printable_onlys     r-   	w_counterzCharsetMatch.w_counterb   sH     	@BT	
 !$$93D	@Q R,22455r/   c                 ~    | j                   &t        | j                  | j                  d      | _         | j                   S )Nstrict)r+   r6   r!   r"   rI   s    r-   __str__zCharsetMatch.__str__p   s.    <<t}}dnnhGDL||r/   c                 N    dj                  | j                  | j                        S )Nz<CharsetMatch '{}' bytes({})>)r5   r8   r9   rI   s    r-   __repr__zCharsetMatch.__repr__v   s    .55dmmTEUEUVVr/   c                     t        |t              r|| k(  r$t        dj                  |j                              d |_        | j                  j                  |       y )Nz;Unable to add instance <{}> as a submatch of a CharsetMatch)r3   r   r=   r5   r7   r+   r'   appendr:   s     r-   add_submatchzCharsetMatch.add_submatchy   sO    %.%4-MTTOO  E"r/   c                     | j                   S N)r"   rI   s    r-   r8   zCharsetMatch.encoding   s    ~~r/   c                     g }t        j                         D ]G  \  }}| j                  |k(  r|j                  |       '| j                  |k(  s7|j                  |       I |S )z
        Encoding name are known by many name, using this could help when searching for IBM855 when it's listed as CP855.
        )r   itemsr8   r]   )r,   also_known_asups       r-   encoding_aliaseszCharsetMatch.encoding_aliases   sY    
 MMODAq}}!$$Q'!#$$Q'	 $
 r/   c                     | j                   S r`   r%   rI   s    r-   bomzCharsetMatch.bom       ###r/   c                     | j                   S r`   rh   rI   s    r-   byte_order_markzCharsetMatch.byte_order_mark   rj   r/   c                 F    | j                   D cg c]  }|d   	 c}S c c}w )z
        Return the complete list of possible languages found in decoded sequence.
        Usually not really useful. Returned list may be empty even if 'language' property return something != 'Unknown'.
        r   r$   )r,   es     r-   r   zCharsetMatch.languages   s$     #oo.o!o...s   c                    | j                   shd| j                  v ryddlm}m} t        | j                        r || j                        n || j                        }t        |      dk(  sd|v ry|d   S | j                   d   d   S )z
        Most probable language found in decoded sequence. If none were detected or inferred, the property will return
        "Unknown".
        asciiEnglishr   )encoding_languagesmb_encoding_languageszLatin BasedUnknown)r$   could_be_from_charsetcharset_normalizer.cdrs   rt   r   r8   rG   )r,   rs   rt   r   s       r-   languagezCharsetMatch.language   s      $444  X *$--8 &dmm4'6  9~"my&@ Q<q!!$$r/   c                     | j                   S r`   )r#   rI   s    r-   r?   zCharsetMatch.chaos   s    $$$r/   c                 @    | j                   sy| j                   d   d   S )Nr    r   r   rn   rI   s    r-   r@   zCharsetMatch.coherence   s     q!!$$r/   c                 6    t        | j                  dz  d      S Nd      )ndigits)roundr?   rI   s    r-   percent_chaoszCharsetMatch.percent_chaos   s    TZZ#%q11r/   c                 6    t        | j                  dz  d      S r|   )r   r@   rI   s    r-   percent_coherencezCharsetMatch.percent_coherence   s    T^^c)155r/   c                     | j                   S )z+
        Original untouched bytes.
        )r!   rI   s    r-   rH   zCharsetMatch.raw   s    
 }}r/   c                     | j                   S r`   )r'   rI   s    r-   submatchzCharsetMatch.submatch   s    ||r/   c                 2    t        | j                        dkD  S Nr   )rG   r'   rI   s    r-   has_submatchzCharsetMatch.has_submatch   s    4<< 1$$r/   c                     | j                   | j                   S t        |       D cg c]  }t        |       }}t        t	        |D ch c]  }|s|	 c}            | _         | j                   S c c}w c c}w r`   )r&   r6   r   sortedlist)r,   chardetected_rangesrs       r-   	alphabetszCharsetMatch.alphabets   s}    +''' -0I
,5DM$I 	 
  &d+L!!A+L&MN###
 ,Ms   A0A5A5c                 p    | j                   g| j                  D cg c]  }|j                   c}z   S c c}w )z
        The complete list of encoding that output the exact SAME str result and therefore could be the originating
        encoding.
        This list does include the encoding available in property 'encoding'.
        )r"   r'   r8   )r,   ms     r-   rv   z"CharsetMatch.could_be_from_charset   s0     t||"D|!1::|"DDD"Ds   3c                     | S z>
        Kept for BC reasons. Will be removed in 3.0.
         rI   s    r-   firstzCharsetMatch.first   	     r/   c                     | S r   r   rI   s    r-   bestzCharsetMatch.best   r   r/   r8   c                     | j                   | j                   |k7  r'|| _         t        |       j                  |d      | _        | j                  S )z
        Method to get re-encoded bytes payload using given target encoding. Default to UTF-8.
        Any errors will be simply ignored by the encoder NOT replaced.
        replace)r*   r6   encoder)   )r,   r8   s     r-   outputzCharsetMatch.output  sJ    
   (D,A,AX,M$,D!#&t9#3#3Hi#HD ###r/   c                 P    t        | j                               j                         S )zw
        Retrieve the unique SHA256 computed using the transformed (re-encoded) payload. Not the original one.
        )r   r   	hexdigestrI   s    r-   r9   zCharsetMatch.fingerprint  s    
 dkkm$..00r/   r`   )r0   r   r1   N)r1   r   )utf_8))__name__
__module____qualname__bytesr6   floatboolr   r.   objectr;   rD   propertyrA   rN   rP   r   rV   rY   r[   r^   r8   r   rf   ri   rl   r   rx   r?   r@   r   r   rH   r   r   r   rv   r   r   r   r9   r   r/   r-   r   r      s    *.'' ' 	'
 ' &' "#'2YF Yt Y(F (t (& 4% 4 4 
*e 
* 
* 	U 	 	 67 6 6 W# W	# #   
$s) 
 
 $T $ $ $ $ $ /49 / / %# % %6 %u % % %5 % %
 2u 2 2 65 6 6 U   $~.   %d % % 	$49 	$ 	$ EtCy E E	$s 	$ 	$ 1S 1 1r/   r   c                       e Zd ZdZddee   fdZdee   fdZde	e
ef   defdZde
fd	Zdefd
ZdeddfdZded   fdZded   fdZy)CharsetMatchesz
    Container with every CharsetMatch items ordered by default from most probable to the less one.
    Act like a list(iterable) but does not implements all related methods.
    Nresultsc                 8    |rt        |      | _        y g | _        y r`   )r   _results)r,   r   s     r-   r.   zCharsetMatches.__init__  s    +2wr/   r1   c              #   6   K   | j                   D ]  }|  y wr`   r   )r,   results     r-   __iter__zCharsetMatches.__iter__  s     mmFL $s   itemc                     t        |t              r| j                  |   S t        |t              r/t	        |d      }| j                  D ]  }||j
                  v s|c S  t        )z
        Retrieve a single item either by its position or encoding name (alias may be used here).
        Raise KeyError upon invalid index or encoding not present in results.
        F)r3   intr   r6   r   rv   KeyError)r,   r   r   s      r-   __getitem__zCharsetMatches.__getitem__#  s[    
 dC ==&&dC T5)D--6777!M ( r/   c                 ,    t        | j                        S r`   rG   r   rI   s    r-   __len__zCharsetMatches.__len__1  s    4==!!r/   c                 2    t        | j                        dkD  S r   r   rI   s    r-   __bool__zCharsetMatches.__bool__4  s    4==!A%%r/   c                    t        |t              s-t        dj                  t	        |j
                                    t        |j                        t        k  rW| j                  D ]H  }|j                  |j                  k(  s|j                  |j                  k(  s7|j                  |        y | j                  j                  |       t        | j                        | _	        y)z~
        Insert a single match. Will be inserted accordingly to preserve sort.
        Can be inserted as a submatch.
        z-Cannot append instance '{}' to CharsetMatchesN)r3   r   r=   r5   r6   r7   rG   rH   r   r   r9   r?   r^   r]   r   )r,   r   matchs      r-   r]   zCharsetMatches.append7  s    
 $-?FF'  txx=,,$$(8(88U[[DJJ=V&&t, ' 	T"t}}-r/   r   c                 :    | j                   sy| j                   d   S )zQ
        Simply return the first match. Strict equivalent to matches[0].
        Nr   r   rI   s    r-   r   zCharsetMatches.bestK  s     }}}}Qr/   c                 "    | j                         S )zP
        Redundant method, call the method best(). Kept for BC reasons.
        )r   rI   s    r-   r   zCharsetMatches.firstS  s     yy{r/   r`   )r   r   r   __doc__r   r   r.   r
   r   r   r   r6   r   r   r   r   r]   r   r   r   r   r/   r-   r   r     s    
;\ 2 ;(<0 c3h L " "&$ &.< .D .( h~.  x/ r/   r   c                       e Zd Zdedee   dee   dee   dedee   deded	ed
ee   defdZe	de
eef   fd       ZdefdZy)CliDetectionResultpathr8   rf   alternative_encodingsrx   r   r   r?   r@   unicode_pathis_preferredc                     || _         |
| _        || _        || _        || _        || _        || _        || _        || _        |	| _	        || _
        y r`   )r   r   r8   rf   r   rx   r   r   r?   r@   r   )r,   r   r8   rf   r   rx   r   r   r?   r@   r   r   s               r-   r.   zCliDetectionResult.__init___  sV     	(  0%:" ",
"(r/   r1   c                     | j                   | j                  | j                  | j                  | j                  | j
                  | j                  | j                  | j                  | j                  | j                  dS )Nr   r8   rf   r   rx   r   r   r?   r@   r   r   r   rI   s    r-   __dict__zCliDetectionResult.__dict__y  se     II $ 5 5%)%?%?"11ZZ -- --
 	
r/   c                 2    t        | j                  dd      S )NT   )ensure_asciiindent)r   r   rI   s    r-   to_jsonzCliDetectionResult.to_json  s    T]]a@@r/   N)r   r   r   r6   r   r   r   r   r.   r   r	   r   r   r   r   r/   r-   r   r   ^  s    )) 3-) s)	)
  $Cy) ) 9) ) ) ) sm) )4 
$sCx. 
 
A Ar/   r   )#rK   collectionsr   encodings.aliasesr   hashlibr   jsonr   rer   typingr   r	   r
   r   r   r   r   constantr   r   mdr   utilsr   r   r   r   r   r6   r   CoherenceMatchr   r   r   r/   r-   <module>r      sm      %    D D D =  C CD1 D1NA AH sEz"' ,A ,Ar/   