
    bCi/                         S SK r S SKJrJr  S SKrS SKJr  S SKJ	r	J
r
JrJrJr  SSKJr  SSKJr  SSKJr  \" S	S
9 " S S\R&                  R(                  5      5       rS/rg)    N)OptionalUnion)BertTokenizer)FastBertTokenizerShrinkLongestTrimmercase_fold_utf8combine_segmentspad_model_inputs   )keras)requires   )tftensorflow_text)backendsc                      ^  \ rS rSrSr          SS\S\S\\   S\\   S\\   S\	S	\S
\S\\   S\S\S\4U 4S jjjr
\SS j5       r\S\\	\R                  4   4S j5       rS r       SS jrS rSrU =r$ )TFBertTokenizer   a@  
This is an in-graph tokenizer for BERT. It should be initialized similarly to other tokenizers, using the
`from_pretrained()` method. It can also be initialized with the `from_tokenizer()` method, which imports settings
from an existing standard tokenizer object.

In-graph tokenizers, unlike other Hugging Face tokenizers, are actually Keras layers and are designed to be run
when the model is called, rather than during preprocessing. As a result, they have somewhat more limited options
than standard tokenizer classes. They are most useful when you want to create an end-to-end model that goes
straight from `tf.string` inputs to outputs.

Args:
    vocab_list (`list`):
        List containing the vocabulary.
    do_lower_case (`bool`, *optional*, defaults to `True`):
        Whether or not to lowercase the input when tokenizing.
    cls_token_id (`str`, *optional*, defaults to `"[CLS]"`):
        The classifier token which is used when doing sequence classification (classification of the whole sequence
        instead of per-token classification). It is the first token of the sequence when built with special tokens.
    sep_token_id (`str`, *optional*, defaults to `"[SEP]"`):
        The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
        sequence classification or for a text and a question for question answering. It is also used as the last
        token of a sequence built with special tokens.
    pad_token_id (`str`, *optional*, defaults to `"[PAD]"`):
        The token used for padding, for example when batching sequences of different lengths.
    padding (`str`, defaults to `"longest"`):
        The type of padding to use. Can be either `"longest"`, to pad only up to the longest sample in the batch,
        or `"max_length", to pad all inputs to the maximum length supported by the tokenizer.
    truncation (`bool`, *optional*, defaults to `True`):
        Whether to truncate the sequence to the maximum length.
    max_length (`int`, *optional*, defaults to `512`):
        The maximum length of the sequence, used for padding (if `padding` is "max_length") and/or truncation (if
        `truncation` is `True`).
    pad_to_multiple_of (`int`, *optional*, defaults to `None`):
        If set, the sequence will be padded to a multiple of this value.
    return_token_type_ids (`bool`, *optional*, defaults to `True`):
        Whether to return token_type_ids.
    return_attention_mask (`bool`, *optional*, defaults to `True`):
        Whether to return the attention_mask.
    use_fast_bert_tokenizer (`bool`, *optional*, defaults to `True`):
        If True, will use the FastBertTokenizer class from Tensorflow Text. If False, will use the BertTokenizer
        class instead. BertTokenizer supports some additional options, but is slower and cannot be exported to
        TFLite.

vocab_listdo_lower_casecls_token_idsep_token_idpad_token_idpadding
truncation
max_lengthpad_to_multiple_ofreturn_token_type_idsreturn_attention_maskuse_fast_bert_tokenizerc                   > [         TU ]  5         U(       a#  [        U4[        R                  US.UD6U l        O[        R                  R                  [        R                  R                  U[        R                  [        R                  " [        R                  " U[        R                  S9[        R                  S9[        R                  S9SS9n[        U4[        R                  US.UD6U l        Xl        X l        Uc  UR                  S5      OUU l        Uc  UR                  S	5      OUU l        Uc  UR                  S
5      OUU l        ['        US-
  SS9U l        Xl        X`l        Xpl        Xl        Xl        Xl        g )N)token_out_typelower_case_nfd_strip_accents)out_type)dtype)keys	key_dtypevaluesvalue_dtyper   )num_oov_buckets)r"   
lower_casez[CLS]z[SEP]z[PAD]r   axis)super__init__r   r   int64tf_tokenizerlookupStaticVocabularyTableKeyValueTensorInitializerstringrangesizeBertTokenizerLayerr   r   indexr   r   r   r   paired_trimmerr   r   r   r   r   r   )selfr   r   r   r   r   r   r   r   r   r   r   r    tokenizer_kwargslookup_table	__class__s                  g/home/james-whalen/.local/lib/python3.13/site-packages/transformers/models/bert/tokenization_bert_tf.pyr/   TFBertTokenizer.__init__;   s`     	" 1!+-88R_!cs!D 99::		33# ii88BGGJ$JRTRZRZ[ "	 4  !" ; L !3!-/XX-!Sc!D %*9E9MJ,,W5S_9E9MJ,,W5S_9E9MJ,,W5S_2:>J$$"4%:"%:"    c           	         UR                  SS5      nUc  UR                  OUnUR                  SS5      nUc  UR                  OUnUR                  SS5      nUc  UR                  OUnUR                  SS5      nUc  UR                  OUnUR                  5       n[        UR                  5       S S9nU Vs/ s H  oS   PM	     n	nU " S
U	UUUUS	.UD6$ s  snf )a  
Initialize a `TFBertTokenizer` from an existing `Tokenizer`.

Args:
    tokenizer (`PreTrainedTokenizerBase`):
        The tokenizer to use to initialize the `TFBertTokenizer`.

Examples:

```python
from transformers import AutoTokenizer, TFBertTokenizer

tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
tf_tokenizer = TFBertTokenizer.from_tokenizer(tokenizer)
```
r   Nr   r   r   c                     U S   $ )Nr    )xs    r?   <lambda>0TFBertTokenizer.from_tokenizer.<locals>.<lambda>   s    AaDrA   )keyr   r   r   r   r   r   rD   )popr   r   r   r   	get_vocabsorteditems)
cls	tokenizerkwargsr   r   r   r   vocabentryr   s
             r?   from_tokenizerTFBertTokenizer.from_tokenizerk   s    $ 

?D93@3H	//mzz.$71=1Ey--<zz.$71=1Ey--<zz.$71=1Ey--<##%u{{}.9,12E5AhE
2 
!'%%%
 
 	
 3s   :Cpretrained_model_name_or_pathc                      [         R                  " U/UQ70 UD6nU R                  " U40 UD6$ !   SSKJn  UR                  " U/UQ70 UD6n N3= f)aM  
Instantiate a `TFBertTokenizer` from a pre-trained tokenizer.

Args:
    pretrained_model_name_or_path (`str` or `os.PathLike`):
        The name or path to the pre-trained tokenizer.

Examples:

```python
from transformers import TFBertTokenizer

tf_tokenizer = TFBertTokenizer.from_pretrained("google-bert/bert-base-uncased")
```
r   )BertTokenizerFast)r   from_pretrainedtokenization_bert_fastrW   rS   )rN   rU   init_inputsrP   rO   rW   s         r?   rX   TFBertTokenizer.from_pretrained   se    "	q%556SlValeklI
 !!)6v66		qA)99:WpZepiopIs	   / Ac                     U R                   (       a  [        U5      nU R                  R                  U5      nUR	                  SS5      $ )Nr   )r   r   r1   tokenize
merge_dims)r;   textstokenss      r?   unpaired_tokenize!TFBertTokenizer.unpaired_tokenize   s>    "5)E""++E2  B''rA   c	                    Uc  U R                   nUS;  a  [        S5      eUb  Ub  [        S5      eUc  U R                  nUc  U R                  nUc  U R                  nUc  U R
                  nUc  U R                  n[        U[        R                  5      (       d  [        R                  " U5      nUb5  [        U[        R                  5      (       d  [        R                  " U5      nUbJ  UR                  R                  S:  a  [        S5      eUR                  R                  S:  a  [        S5      eUR                  R                  S:X  a  US S 2S4   US S 2S4   p!U R                  U5      nUc8  U(       a  US S 2S US-
  24   n[        U4U R                  U R                   S	9u  pOXU R                  U5      nU(       a  U R"                  R%                  X/5      u  p[        X4U R                  U R                   S	9u  pUS
:X  a8  U	R'                  SS9nUb%  U[        R(                  R+                  U* U5      * -  nOUn[-        XU R.                  S9u  pSU	0nU(       a  XS'   U(       a  [-        XU R.                  S9u  pXS'   U$ )N)longestr   z1Padding must be either 'longest' or 'max_length'!zJmax_length cannot be overridden at call time when truncating paired texts!r   zJtext argument should not be multidimensional when a text pair is supplied!z)text_pair should not be multidimensional!   r   )start_of_sequence_idend_of_segment_idre   r,   )max_seq_length	pad_value	input_idsattention_masktoken_type_ids)r   
ValueErrorr   r   r   r   r   
isinstancer   Tensorconvert_to_tensorshaperankrb   r	   r   r   r:   trimbounding_shapemathfloordivr
   r   )r;   text	text_pairr   r   r   r   r   r   rk   rm   
pad_lengthrl   output_s                  r?   callTFBertTokenizer.call   s    ?llG33PQQ!i&;ijjJJ%!%!8!8 ($($>$>! ($($>$>!$		**''-D Iryy)I)I,,Y7I zz" !mnn##a' !LMM::??a"1a4j$q!t*)%%d+A/a//0(8d.?.?SWSdSd)%I~ ..y9I"&"5"5":":D;L"M(8!8I8I]a]n]n)%I i"11q19J!-/BGG4D4Dj[Rd4e3ef
#J$4Yeievev$w!	y) '5#$  0TEVEV!N (6#$rA   c                 v    U R                   U R                  U R                  U R                  U R                  S.$ )NrI   rI   )r;   s    r?   
get_configTFBertTokenizer.get_config   s7    //!// -- -- --
 	
rA   )r   r   r   r   r   r   r:   r   r   r   r1   r   r   )
NNNre   Ti   NTTT)rO   PreTrainedTokenizerBase)NNNNNNN)__name__
__module____qualname____firstlineno____doc__listboolr   intstrr/   classmethodrS   r   osPathLikerX   rb   r}   r   __static_attributes____classcell__)r>   s   @r?   r   r      s%   *` '+&*&* ,0&*&*(,.;.; .; sm	.;
 sm.; sm.; .; .; .; %SM.;  $.;  $.; "&.; .;` $
 $
L 7E#r{{BR<S 7 70( ""FP
 
rA   r   )r   typingr   r   
tensorflowr   r   r   r8   r   r   r   r	   r
   modeling_tf_utilsr   utils.import_utilsr   tokenization_bertlayersLayerr   __all__rD   rA   r?   <module>r      sY    	 "  ? w w & * , 
,-r
ell(( r
 .r
j 
rA   