
    cCi*                         S SK r S SKJr  S SKJr  S SKJr  SSKJrJ	r	  SSK
Jr  SSKJrJr  \" 5       (       a  S	S
KJr  OSr\R"                  " \5      rSSS.r/ SQr " S S\5      rS/rg)    N)copyfile)Optional)
processors   )
AddedTokenBatchEncoding)PreTrainedTokenizerFast)is_sentencepiece_availablelogging   )MBartTokenizerzsentencepiece.bpe.modelztokenizer.json)
vocab_filetokenizer_file)ar_ARcs_CZde_DEen_XXes_XXet_EEfi_FIfr_XXgu_INhi_INit_ITja_XXkk_KZko_KRlt_LTlv_LVmy_MMne_NPnl_XXro_ROru_RUsi_LKtr_TRvi_VNzh_CNc                     ^  \ rS rSr% Sr\rSS/r\r	/ r
\\   \S'   / r\\   \S'               S!U 4S jjr\S	\4S
 j5       r\R&                  S\S	S4S j5       r S"S\\   S\\\      S	\\   4S jjr S"S\\   S\\\      S	\\   4S jjrS\S\\   S\\   4S jr   S#S\\   S\S\\\      S\S	\4
U 4S jjjrS rS rS$S jrS\S	S4S jrS"S\S\\   S	\\   4S jjrS r U =r!$ )%MBartTokenizerFast*   u  
Construct a "fast" MBART tokenizer (backed by HuggingFace's *tokenizers* library). Based on
[BPE](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=BPE#models).

This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
refer to this superclass for more information regarding those methods.

The tokenization method is `<tokens> <eos> <language code>` for source language documents, and `<language code>
<tokens> <eos>` for target language documents.

Examples:

```python
>>> from transformers import MBartTokenizerFast

>>> tokenizer = MBartTokenizerFast.from_pretrained(
...     "facebook/mbart-large-en-ro", src_lang="en_XX", tgt_lang="ro_RO"
... )
>>> example_english_phrase = " UN Chief Says There Is No Military Solution in Syria"
>>> expected_translation_romanian = "Şeful ONU declară că nu există o soluţie militară în Siria"
>>> inputs = tokenizer(example_english_phrase, text_target=expected_translation_romanian, return_tensors="pt")
```	input_idsattention_maskprefix_tokenssuffix_tokensNc                   > [        U	[        5      (       a  [        U	SSS9OU	n	[        R	                  5       nUb)  UR                  U Vs/ s H  oU;  d  M
  UPM     sn5        [        TU ]  " SUUUUUUUUU	U
UUS.UD6  Xl        [         Vs0 s H  nUU R                  U5      _M     snU l
        U
b  U
OSU l        U R                  U R                  5      U l        Xl        U R                  U R                  5        g s  snf s  snf )NTF)lstriprstrip)r   r   	bos_token	eos_token	sep_token	cls_token	unk_token	pad_token
mask_tokensrc_langtgt_langadditional_special_tokensr    )
isinstancestrr   FAIRSEQ_LANGUAGE_CODEScopyextendsuper__init__r   convert_tokens_to_idslang_code_to_id	_src_langcur_lang_coder;   set_src_lang_special_tokens)selfr   r   r3   r4   r5   r6   r7   r8   r9   r:   r;   r<   kwargs_additional_special_tokenst	lang_code	__class__s                    k/home/james-whalen/.local/lib/python3.13/site-packages/transformers/models/mbart/tokenization_mbart_fast.pyrD   MBartTokenizerFast.__init__I   s"   " KUU_adJeJeZ
4Fku
%;%@%@%B"$0&--5]5qB\9\5] 	 	
!)!&@	
 	
  %Nd 
NdIt11)<<Nd 
 &.%9w!77G ((87 ^( 
s   
	DDDreturnc                     U R                   $ N)rG   rJ   s    rP   r:   MBartTokenizerFast.src_lang~   s    ~~    new_src_langc                 F    Xl         U R                  U R                   5        g rT   )rG   rI   )rJ   rX   s     rP   r:   rV      s    %((8rW   token_ids_0token_ids_1c                 ~    Uc  U R                   U-   U R                  -   $ U R                   U-   U-   U R                  -   $ )a"  
Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
adding special tokens. The special tokens depend on calling set_lang.

An MBART sequence has the following format, where `X` represents the sequence:

- `input_ids` (for encoder) `X [eos, src_lang_code]`
- `decoder_input_ids`: (for decoder) `X [eos, tgt_lang_code]`

BOS is never used. Pairs of sequences are not the expected use case, but they will be handled without a
separator.

Args:
    token_ids_0 (`list[int]`):
        List of IDs to which the special tokens will be added.
    token_ids_1 (`list[int]`, *optional*):
        Optional second list of IDs for sequence pairs.

Returns:
    `list[int]`: list of [input IDs](../glossary#input-ids) with the appropriate special tokens.
)r.   r/   )rJ   rZ   r[   s      rP    build_inputs_with_special_tokens3MBartTokenizerFast.build_inputs_with_special_tokens   sG    0 %%3d6H6HHH!!K/+=@R@RRRrW   c                     U R                   /nU R                  /nUc  [        XA-   U-   5      S/-  $ [        XA-   U-   U-   U-   U-   5      S/-  $ )a{  
Create a mask from the two sequences passed to be used in a sequence-pair classification task. mBART does not
make use of token type ids, therefore a list of zeros is returned.

Args:
    token_ids_0 (`list[int]`):
        List of IDs.
    token_ids_1 (`list[int]`, *optional*):
        Optional second list of IDs for sequence pairs.

Returns:
    `list[int]`: List of zeros.

r   )sep_token_idcls_token_idlen)rJ   rZ   r[   sepclss        rP   $create_token_type_ids_from_sequences7MBartTokenizerFast.create_token_type_ids_from_sequences   si    $   !  !s(3./1#553$s*S0;>DEKKrW   return_tensorsr:   r;   c                 v    Ub  Uc  [        S5      eX0l        U " U4SUS.UD6nU R                  U5      nXvS'   U$ )zIUsed by translation pipeline, to prepare inputs for the generate functionzATranslation requires a `src_lang` and a `tgt_lang` for this modelT)add_special_tokensrg   forced_bos_token_id)
ValueErrorr:   rE   )rJ   
raw_inputsrg   r:   r;   extra_kwargsinputstgt_lang_ids           rP   _build_translation_inputs,MBartTokenizerFast._build_translation_inputs   sU     x/`aa jiT.i\hi00:(3$%rW   	src_texts	tgt_textsc                 >   > X l         X@l        [        TU ]  " X40 UD6$ rT   )r:   r;   rC   prepare_seq2seq_batch)rJ   rr   r:   rs   r;   rK   rO   s         rP   ru   (MBartTokenizerFast.prepare_seq2seq_batch   s$     ! w,YLVLLrW   c                 8    U R                  U R                  5      $ rT   )rI   r:   rU   s    rP   _switch_to_input_mode(MBartTokenizerFast._switch_to_input_mode       //>>rW   c                 8    U R                  U R                  5      $ rT   )set_tgt_lang_special_tokensr;   rU   s    rP   _switch_to_target_mode)MBartTokenizerFast._switch_to_target_mode   rz   rW   c                    U R                  U5      U l        / U l        U R                  U R                  /U l        U R                  U R                  5      nU R                  U R                  5      n[        R                  " US/-   U-   USS/-   U-   [        [        X#-   U R                  U R                  -   5      5      S9U R                  l        g)z_Reset the special tokens to the source lang setting. No prefix and suffix=[eos, src_lang_code].$A$Bsinglepairspecial_tokensNrE   rH   r.   eos_token_idr/   convert_ids_to_tokensr   TemplateProcessinglistzip
_tokenizerpost_processor)rJ   r:   prefix_tokens_strsuffix_tokens_strs       rP   rI   .MBartTokenizerFast.set_src_lang_special_tokens   s    !77A"//1C1CD 66t7I7IJ 66t7I7IJ)3)F)F$v-0AA"dD\14EE$5$I4K]K]`d`r`rKr st*
&rW   langc                    U R                  U5      U l        / U l        U R                  U R                  /U l        U R                  U R                  5      nU R                  U R                  5      n[        R                  " US/-   U-   USS/-   U-   [        [        X#-   U R                  U R                  -   5      5      S9U R                  l        g)zcReset the special tokens to the target language setting. No prefix and suffix=[eos, tgt_lang_code].r   r   r   Nr   )rJ   r   r   r   s       rP   r|   .MBartTokenizerFast.set_tgt_lang_special_tokens   s    !77="//1C1CD 66t7I7IJ 66t7I7IJ)3)F)F$v-0AA"dD\14EE$5$I4K]K]`d`r`rKr st*
&rW   save_directoryfilename_prefixc                    U R                   (       d  [        S5      e[        R                  R	                  U5      (       d  [
        R                  SU S35        g [        R                  R                  X(       a  US-   OS[        S   -   5      n[        R                  R                  U R                  5      [        R                  R                  U5      :w  a  [        U R                  U5        U4$ )NzhYour fast tokenizer does not have the necessary information to save the vocabulary for a slow tokenizer.zVocabulary path (z) should be a directory.- r   )can_save_slow_tokenizerrk   ospathisdirloggererrorjoinVOCAB_FILES_NAMESabspathr   r   )rJ   r   r   out_vocab_files       rP   save_vocabulary"MBartTokenizerFast.save_vocabulary   s    ++ 
 ww}}^,,LL,^,<<TUVo_s22QbcoQpp
 77??4??+rww~/NNT__n5  rW   )rG   rH   rF   r.   r:   r/   r;   r   )NN<s></s>r   r   z<unk>z<pad>z<mask>NNNrT   )r   Nr#   )rR   N)"__name__
__module____qualname____firstlineno____doc__r   vocab_files_namesmodel_input_namesr   slow_tokenizer_classr.   r   int__annotations__r/   rD   propertyr?   r:   setterr   r]   re   rp   r   ru   rx   r}   rI   r|   tupler   __static_attributes____classcell__)rO   s   @rP   r*   r*   *   s   . *$&67)!M49!!M49! "&39j #   __9S 9T 9 9
 JNS9S3;DI3FS	cS< JNL9L3;DI3FL	cL2
*-
9A#
RZ[^R_
  )-
M9
M 
M DI&	
M
 
M 

M 
M??

 
 
!c !HSM !]bcf]g ! !rW   r*   )r   shutilr   typingr   
tokenizersr   tokenization_utilsr   r   tokenization_utils_fastr	   utilsr
   r   tokenization_mbartr   
get_loggerr   r   r   r@   r*   __all__r=   rW   rP   <module>r      sr     
   ! ; > 8 2N 
		H	% $=P`a  { `!0 `!F  
 rW   