
    +h2                    ^   S SK Jr  S SKrS SKrS SKJrJr  S SKJr  S SK	J
r
  S SKJrJrJrJrJrJrJr  S SKJrJr  S SKJr  \(       a  S S	KJrJrJr  S S
KJr  \R<                  " \5      r \" SSS9r! " S S\\5      r" " S S\"5      r# " S S\$\
5      r%\" SS9 " S S5      5       r&SS jr'g)    )annotationsN)ABCabstractmethod)	dataclass)Enum)TYPE_CHECKINGAnyCallableLiteralOptionalTypeVarUnion)BaseDocumentTransformerDocument)Self)
CollectionIterableSequence)SetTSTextSplitter)boundc                     \ rS rSrSrSS\SSS4             SS jjr\SS j5       r S     SS
 jjr	SS jr
SS jrSS jr\SS j5       r\SS	\" 5       S4           SS jj5       r      SS jrSrg	)r      z)Interface for splitting text into chunks.i     FTc                    US::  a  SU 3n[        U5      eUS:  a  SU 3n[        U5      eX!:  a  SU SU S3n[        U5      eXl        X l        X0l        X@l        XPl        X`l        g)a  Create a new TextSplitter.

Args:
    chunk_size: Maximum size of chunks to return
    chunk_overlap: Overlap in characters between chunks
    length_function: Function that measures the length of given chunks
    keep_separator: Whether to keep the separator and where to place it
                    in each corresponding chunk (True='start')
    add_start_index: If `True`, includes chunk's start index in metadata
    strip_whitespace: If `True`, strips whitespace from the start and end of
                      every document
r   zchunk_size must be > 0, got z chunk_overlap must be >= 0, got zGot a larger chunk overlap (z) than chunk size (z), should be smaller.N)
ValueError_chunk_size_chunk_overlap_length_function_keep_separator_add_start_index_strip_whitespace)self
chunk_sizechunk_overlaplength_functionkeep_separatoradd_start_indexstrip_whitespacemsgs           M/root/34ku/venv/lib/python3.13/site-packages/langchain_text_splitters/base.py__init__TextSplitter.__init__!   s    * ?0=CS/!14]ODCS/!%.}o ><46  S/!%+ /- /!1    c                    g)z$Split text into multiple components.N )r$   texts     r,   
split_textTextSplitter.split_textI   s    r/   Nc           	        U=(       d    0 /[        U5      -  n/ n[        U5       H  u  pVSnSnU R                  U5       H  n	[        R                  " X5   5      n
U R
                  (       a<  Xx-   U R                  -
  nUR                  U	[        SU5      5      nXzS'   [        U	5      n[        XS9nUR                  U5        M     M     U$ )z&Create documents from a list of texts.r   start_index)page_contentmetadata)len	enumerater3   copydeepcopyr"   r   findmaxr   append)r$   texts	metadatas
_metadatas	documentsir2   indexprevious_chunk_lenchunkr8   offsetnew_docs                r,   create_documentsTextSplitter.create_documentsM   s     32$U"3
	 'GAE!".==7(("7$:M:MMF IIeSF^<E.3]+),U&"I  ) / ( r/   c                    / / p2U H9  nUR                  UR                  5        UR                  UR                  5        M;     U R                  X#S9$ )zSplit documents.)rA   )r?   r7   r8   rJ   )r$   rC   r@   rA   docs        r,   split_documentsTextSplitter.split_documentsa   sM    ryCLL))*S\\*  $$U$@@r/   c                x    UR                  U5      nU R                  (       a  UR                  5       nUS:X  a  g U$ )N )joinr#   strip)r$   docs	separatorr2   s       r,   
_join_docsTextSplitter._join_docsi   s3    ~~d#!!::<D2:r/   c                x   U R                  U5      n/ n/ nSnU GHv  nU R                  U5      nXh-   [        U5      S:  a  UOS-   U R                  :  Ga  X`R                  :  a%  [        R	                  SU SU R                   35        [        U5      S:  a  U R                  XR5      n	U	b  UR                  U	5        X`R                  :  d,  Xh-   [        U5      S:  a  UOS-   U R                  :  at  US:  an  X`R                  US   5      [        U5      S:  a  UOS-   -  nUSS  nX`R                  :  a  M@  Xh-   [        U5      S:  a  UOS-   U R                  :  a  US:  a  Mn  UR                  U5        Xh[        U5      S:  a  UOS-   -  nGMy     U R                  XR5      n	U	b  UR                  U	5        U$ )Nr   zCreated a chunk of size z%, which is longer than the specified    )r    r9   r   loggerwarningrV   r?   r   )
r$   splitsrU   separator_lenrT   current_doctotald_lenrM   s
             r,   _merge_splitsTextSplitter._merge_splitsq   s    --i8!#A((+D[1AA1E1M""# +++NN25' :>>B>N>N=OQ {#a'//+ACC(  "5"55[9IA9MSTU**+!AI!6!6{1~!F-0-=-AMq"  '2!"o  "5"55[9IA9MSTU**+!AI q!c+.>.B]JJE9 : ook5?KKr/   c                   ^  SSK Jn  [        TU5      (       d  Sn[        U5      eSU4S jjnU " S	SU0UD6$ ! [         a  nSn[        U5      UeSnAff = f)
z>Text splitter that uses HuggingFace tokenizer to count length.r   )PreTrainedTokenizerBasezATokenizer received was not an instance of PreTrainedTokenizerBasec                8   > [        TR                  U 5      5      $ N)r9   tokenizer2   	tokenizers    r,   _huggingface_tokenizer_lengthNTextSplitter.from_huggingface_tokenizer.<locals>._huggingface_tokenizer_length   s    9--d344r/   z`Could not import transformers python package. Please install it with `pip install transformers`.Nr'   r2   strreturnintr1   )$transformers.tokenization_utils_basere   
isinstancer   ImportError)clsrj   kwargsre   r+   rk   errs    `     r,   from_huggingface_tokenizer'TextSplitter.from_huggingface_tokenizer   ss    	+Ti)@AAW  !o%5 K#@KFKK  	+E  S/s*	+s   ,9 
AAAgpt2allc                
  ^^^  SSK nUb  UR                  U5      mOUR                  U5      mSUUU4S jjn	[	        U [
        5      (       a  UUTTS.n
0 UEU
EnU " SSU	0UD6$ ! [         a  nSn[        U5      UeSnAff = f)	z9Text splitter that uses tiktoken encoder to count length.r   NzCould not import tiktoken python package. This is needed in order to calculate max_tokens_for_prompt. Please install it with `pip install tiktoken`.c                8   > [        TR                  U TTS95      $ N)allowed_specialdisallowed_special)r9   encode)r2   r~   r   encs    r,   _tiktoken_encoder=TextSplitter.from_tiktoken_encoder.<locals>._tiktoken_encoder   s*    

$3'9   r/   )encoding_name
model_namer~   r   r'   rm   r1   )tiktokenrs   encoding_for_modelget_encoding
issubclassTokenTextSplitter)rt   r   r   r~   r   ru   r   rv   r+   r   extra_kwargsr   s      ``      @r,   from_tiktoken_encoder"TextSplitter.from_tiktoken_encoder   s    	, !--j9C''6C	 	 c,--!.(#2&8	L 0/,/F?#4????  	,A 
 c"+	,s   A% %
B/A==Bc                6    U R                  [        U5      5      $ )z2Transform sequence of documents by splitting them.)rN   list)r$   rC   ru   s      r,   transform_documents TextSplitter.transform_documents   s     ##DO44r/   )r"   r   r   r!   r    r#   )r%   rp   r&   rp   r'   zCallable[[str], int]r(   z$Union[bool, Literal['start', 'end']]r)   boolr*   r   ro   Noner2   rn   ro   	list[str]rg   )r@   r   rA   zOptional[list[dict[Any, Any]]]ro   list[Document])rC   zIterable[Document]ro   r   )rT   r   rU   rn   ro   Optional[str])r\   zIterable[str]rU   rn   ro   r   )rj   r	   ru   r	   ro   r   )r   rn   r   r   r~   'Union[Literal['all'], AbstractSet[str]]r   &Union[Literal['all'], Collection[str]]ru   r	   ro   r   )rC   Sequence[Document]ru   r	   ro   r   )__name__
__module____qualname____firstlineno____doc__r9   r-   r   r3   rJ   rN   rV   rb   classmethodrw   setr   r   __static_attributes__r1   r/   r,   r   r      sH   3  03?D %!%&2&2 &2 .	&2
 =&2 &2 &2 
&2P 3 3 MQ+I	(A(T L L,  $$(CF5EJ*@*@ "*@ A	*@
 C*@ *@ 
*@ *@X5+57:5	5r/   c                  f   ^  \ rS rSrSrSS\" 5       S4           S	U 4S jjjrS
S jrSrU =r	$ )r      z/Splitting text to tokens using model tokenizer.ry   Nrz   c                   > [         T
U ]  " S0 UD6   SSKnUb  UR	                  U5      n	OUR                  U5      n	Xl        X0l        X@l        g! [         a  nSn[        U5      UeSnAff = f)zCreate a new TextSplitter.r   NzCould not import tiktoken python package. This is needed in order to for TokenTextSplitter. Please install it with `pip install tiktoken`.r1   )	superr-   r   rs   r   r   
_tokenizer_allowed_special_disallowed_special)r$   r   r   r~   r   ru   r   rv   r+   r   	__class__s             r,   r-   TokenTextSplitter.__init__   s     	"6"	, !--j9C''6C /#5   	,A 
 c"+	,s   A 
A-A((A-c                   ^  SU 4S jjn[        T R                  T R                  T R                  R                  US9n[        XS9$ )a/  Splits the input text into smaller chunks based on tokenization.

This method uses a custom tokenizer configuration to encode the input text
into tokens, processes the tokens in chunks of a specified size with overlap,
and decodes them back into text chunks. The splitting is performed using the
`split_text_on_tokens` function.

Args:
    text (str): The input text to be split into smaller chunks.

Returns:
    List[str]: A list of text chunks, where each chunk is derived from a portion
    of the input text based on the tokenization and chunking rules.
c                b   > TR                   R                  U TR                  TR                  S9$ r}   )r   r   r   r   )_textr$   s    r,   _encode-TokenTextSplitter.split_text.<locals>._encode  s4    ??)) $ 5 5#'#;#; *  r/   )r&   tokens_per_chunkdecoder   ri   )r   rn   ro   z	list[int])	Tokenizerr   r   r   r   split_text_on_tokens)r$   r2   r   rj   s   `   r,   r3   TokenTextSplitter.split_text  sC     	 --!--??))	
	 $CCr/   )r   r   r   )r   rn   r   r   r~   r   r   r   ru   r	   ro   r   r   )
r   r   r   r   r   r   r-   r3   r   __classcell__)r   s   @r,   r   r      sh    9 $$(CF5EJ66 "6 A	6
 C6 6 
6 68D Dr/   r   c                      \ rS rSrSrSrSrSrSrSr	Sr
S	rS
rSrSrSrSrSrSrSrSrSrSrSrSrSrSrSrSrSrSrSrSr g) Languagei&  z"Enum of the programming languages.cppgojavakotlinjstsphpprotopythonrstrubyrustscalaswiftmarkdownlatexhtmlsolcsharpcobolcluaperlhaskellelixir
powershellvisualbasic6r1   N)!r   r   r   r   r   CPPGOJAVAKOTLINJSr   PHPPROTOPYTHONRSTRUBYRUSTSCALASWIFTMARKDOWNLATEXHTMLSOLCSHARPCOBOLCLUAPERLHASKELLELIXIR
POWERSHELLVISUALBASIC6r   r1   r/   r,   r   r   &  s    ,
C	BDF	B	B
CEF
CDDEEHED
CFEA
CDGFJ!Lr/   r   T)frozenc                  H    \ rS rSr% SrS\S'    S\S'    S\S'    S\S	'   S
rg)r   iF  zTokenizer data class.rp   r&   r   zCallable[[list[int]], str]r   zCallable[[str], list[int]]r   r1   N)r   r   r   r   r   __annotations__r   r1   r/   r,   r   r   F  s)    *,&&=&&=r/   r   c                   / nUR                  U 5      nSn[        XAR                  -   [        U5      5      nX4U nU[        U5      :  a  UR	                  UR                  U5      5        U[        U5      :X  a   U$ XAR                  UR                  -
  -  n[        XAR                  -   [        U5      5      nX4U nU[        U5      :  a  M  U$ )z6Split incoming text and return chunks using tokenizer.r   )r   minr   r9   r?   r   r&   )r2   rj   r\   	input_ids	start_idxcur_idx	chunk_idss          r,   r   r   T  s    F  &II)888#i.IGG,I
c)n
$i&&y12c)n$ M 	//)2I2III	i"<"<<c)nM0	 c)n
$ Mr/   )r2   rn   rj   r   ro   r   )(
__future__r   r;   loggingabcr   r   dataclassesr   enumr   typingr   r	   r
   r   r   r   r   langchain_core.documentsr   r   typing_extensionsr   collections.abcr   r   r   r   AbstractSet	getLoggerr   rZ   r   r   r   rn   r   r   r   r1   r/   r,   <module>r     s    "   # !    G ">>2			8	$T(E5*C E5P=D =D@"sD "@ $
> 
> 
>r/   