
    h"                        S SK Jr  S SKrS SKrS SKrS SKJr   S SKJr  S SK
r
S SKJr  S SKJr  \(       a  S SKJr  \R                   " \5      r " S S	\5      rg! \ a	    S SK	Jr   NIf = f)
    )annotationsN)TYPE_CHECKING)Self)AutoTokenizer)InputModule)PreTrainedTokenizerc                    ^  \ rS rSr% SrS/rS\S'     S     SU 4S jjjrSS jrSS	.SS
 jjr	\
    S           SS jj5       r\
     S             SS jj5       rSS jrSS jr S     SS jjrSrU =r$ )SparseStaticEmbedding   a  
SparseStaticEmbedding module for efficient sparse representations.

This lightweight module computes sparse representations by mapping input tokens to static weights,
such as IDF (Inverse Document Frequency) weights. It is designed to encode queries or documents
into fixed-size embeddings based on the presence of tokens in the input.

A common scenario is to use this module for encoding queries, and using a heavier module like
SPLADE (MLMTransformer + SpladePooling) for document encoding.

Args:
    tokenizer (PreTrainedTokenizer): PreTrainedTokenizer to tokenize input texts into input IDs.
    weight (torch.Tensor | None): Static weights for vocabulary tokens (e.g., IDF weights),
        shape should be (vocab_size,). If None, initializes weights to a vector of ones.
        Default is None.
    frozen (bool): Whether the weights should be frozen (not trainable). Default is False.
frozenz	list[str]config_keysc                  > [         TU ]  5         Xl        Ub(  [        R                  R                  X#(       + S9U l        O][        R                  R                  [        R                  " [        U R                  R                  5       5      5      U(       + S9U l        X0l
        U R                  R                  S5      U l        U R                  R                  U l        g )N)requires_gradr   )super__init__	tokenizertorchnn	Parameterweightoneslen	get_vocabr   sizenum_dimensionsmodel_max_lengthmax_seq_length)selfr   r   r   	__class__s       {/home/james-whalen/.local/lib/python3.13/site-packages/sentence_transformers/sparse_encoder/models/SparseStaticEmbedding.pyr   SparseStaticEmbedding.__init__-   s     	"((,,V:,NDK((,,UZZDNN<T<T<V8W-Xlrhr,sDK"kk..q1"nn==    c                "   US   nUS   nUR                  SS 5      nUR                  S   n[        R                  " XPR                  UR
                  [        R                  S9nUR                  SX#5        X`R                  -  nUb  Xt-  nXqS'   U$ )N	input_idsattention_masksentence_embeddingr   )devicedtype   )	getshaper   zerosr   r'   int64scatter_r   )r   featuresr$   r%   r&   
batch_sizetoken_presence
embeddingss           r    forwardSparseStaticEmbedding.forward?   s    [)	!"23%\\*>E__Q'
 Z1D1DYM]M]ejepepq 	9= $kk1
 )#8J)3%&r"   Tsafe_serializationc               f    U R                  U5        U R                  XS9  U R                  U5        g )Nr5   )save_tokenizersave_torch_weightssave_config)r   output_pathr6   argskwargss        r    saveSparseStaticEmbedding.saveU   s/    K(S%r"   c           	     r   [         R                  R                  U5      (       d   SSKJn  U" USUUUUS9n[        U5       n	[        R                  " U	5      n
SSS5        [        W
R                  5       6 u  pUR                  [        U5      5      n[        R                  " U[        R                  S9n[!        U5      S	-   n[        R"                  " U[        R                  S9n[        X5       H  u  nnUUU'   M     U " SXS
.UD6$ ! [
         a    [        SU S35      ef = f! , (       d  f       N= f)a  
Create an SparseStaticEmbedding module from a JSON file containing token to IDF weight mappings.

Args:
    json_path (str): Path to the JSON file containing token to IDF weight mappings.
    tokenizer (PreTrainedTokenizer): Tokenizer to use for converting tokens to IDs.
    token (bool | str | None): Token for Hugging Face authentication
    cache_folder (str | None): Cache folder for Hugging Face
    revision (str | None): Model revision
    local_files_only (bool): Whether to only load local files
    **config: Additional configuration options for the IDF model.

Returns:
    SparseStaticEmbedding: An initialized SparseStaticEmbedding model.
r   )hf_hub_downloadzidf.json)repo_idfilenametoken	cache_dirrevisionlocal_files_onlyzIDF JSON file not found at z. Please provide a valid path.N)r(   r)   r   r    )ospathexistshuggingface_hubrA   
ValueErroropenjsonloadzipitemsconvert_tokens_to_idslistr   tensorfloat32maxr,   )cls	json_pathr   rD   cache_folderrF   rG   configrA   fInidftokensweights	token_idsmax_token_idr   token_idws                     r    	from_jsonSparseStaticEmbedding.from_jsonZ   s   4 ww~~i((j;+%'*%%5	 )_))C.C  syy{+33DLA	,,wemm<9~)\?y2KHa F8 3 @&@@@  j #>ykIg!hiij _s   D D(D%(
D6c           
     (   U R                  UUUUUUS9n[        R                  " UUUUUUS9n	UR                  SS5      n
U
b/  U
R	                  S5      (       a  U R
                  " U
U	4UUUUS.UD6$ U " S	SU	S.UD6nU R                  UUUUUUUS9nU$ )
a/  
Load the SparseStaticEmbedding module with its tokenizer.

Args:
    model_name_or_path (str): Path to the directory containing the saved model.
    subfolder (str): Subfolder within the model directory
    token (bool | str | None): Token for Hugging Face authentication
    cache_folder (str | None): Cache folder for Hugging Face
    revision (str | None): Model revision
    local_files_only (bool): Whether to only load local files
    **kwargs: Additional keyword arguments

Returns:
    SparseStaticEmbedding: The loaded SparseStaticEmbedding module.
)model_name_or_path	subfolderrD   r[   rF   rG   )ri   rD   rE   rF   rG   rK   Nz.json)rD   rE   rF   rG   rH   )rh   ri   rD   r[   rF   rG   modelrI   )load_configr   from_pretrainedpopendswithre   load_torch_weights)rY   rh   ri   rD   r[   rF   rG   r=   r\   r   rK   rj   s               r    rQ   SparseStaticEmbedding.load   s    4 1%- ! 
 "11"-
	 zz&$'g 6 6== &!!1   ?49??&&1%- ' 
 r"   c                    SU R                   R                  R                   3nSU R                  5        SU R                   U S3$ )Nz, tokenizer=zSparseStaticEmbedding(z, dim=))r   r   __name__get_config_dictr   )r   tokenizer_infos     r    __repr__SparseStaticEmbedding.__repr__   sL    '(@(@(I(I'JK'(<(<(>'?vdFYFYEZ[iZjjkllr"   c                    U R                   $ )N)r   )r   s    r     get_sentence_embedding_dimension6SparseStaticEmbedding.get_sentence_embedding_dimension   s    """r"   c           
     8    [        U R                  XSSSS95      $ )NTptF)padding
truncationreturn_tensorsadd_special_tokens)dictr   )r   textsr}   s      r    tokenizeSparseStaticEmbedding.tokenize   s'     NN5dSWlqNr
 	
r"   )r   r   r   r   r   )NF)r   r   r   ztorch.Tensor | Noner   bool)r/   dict[str, torch.Tensor]returnr   )r;   strr6   r   r   None)NNNF)rZ   r   r   r   rD   bool | str | Noner[   
str | NonerF   r   rG   r   ) NNNF)rh   r   ri   r   rD   r   r[   r   rF   r   rG   r   r   r   )r   r   )r   int)T)r   z.list[str] | list[dict] | list[tuple[str, str]]r}   z
str | boolr   r   )rs   
__module____qualname____firstlineno____doc__r   __annotations__r   r3   r>   classmethodre   rQ   rv   ry   r   __static_attributes____classcell__)r   s   @r    r
   r
      sa   $ 'ZK'
 '+	>&> $> 	> >$, HL &
 
 $(#'#!&4A4A '4A !	4A
 !4A 4A 4A 4Al  #'#'#!&DD D !	D
 !D D D 
D DLm# \`
C
NX
	 
 
r"   r
   )
__future__r   rP   loggingrJ   typingr   r   ImportErrortyping_extensionsr   transformersr   (sentence_transformers.models.InputModuler   r   	getLoggerrs   loggerr
   rI   r"   r    <module>r      s_    "   	  '  & @0			8	$L
K L
  '&'s   A A*)A*