ó
    — óhK  ã                  ób   • S SK Jr  S SKJr  S SKJr  S SKJr  S SKJ	r	  S SK
Jr   " S S\	5      rg	)
é    )Úannotations)ÚIterable)ÚTensor)Úutil)ÚDistillKLDivLoss)ÚSparseEncoderc                  óR   ^ • \ rS rSr\R
                  S4SU 4S jjjrSS jrSrU =r	$ )ÚSparseDistillKLDivLossé   g       @c                ó"   >• [         TU ]  XUS9  g)aî  
Compute the KL divergence loss between probability distributions derived from student and teacher models' similarity scores.
By default, similarity is calculated using the dot-product. This loss is designed for knowledge distillation
where a smaller student model learns from a more powerful teacher model.

The loss computes softmax probabilities from the teacher similarity scores and log-softmax probabilities
from the student model, then calculates the KL divergence between these distributions.

Args:
    model: SentenceTransformer model (student model)
    similarity_fct: Which similarity function to use for the student model
    temperature: Temperature parameter to soften probability distributions (higher temperature = softer distributions)
        When combined with other losses, a temperature of 1.0 is also viable, but a higher temperature (e.g., 2.0 or 4.0)
        can help prevent the student model from going to zero active dimensions. Defaults to 2.0.

References:
    - For more details, please refer to https://arxiv.org/abs/2010.11386

Requirements:
    1. Need to be used in SpladeLoss or CSRLoss as a loss function.
    2. (query, positive, negative_1, ..., negative_n) examples
    3. Labels containing teacher model's scores between query-positive and query-negative pairs

Inputs:
    +------------------------------------------------+------------------------------------------------------------+
    | Texts                                          | Labels                                                     |
    +================================================+============================================================+
    | (query, positive, negative)                    | [Teacher(query, positive), Teacher(query, negative)]       |
    +------------------------------------------------+------------------------------------------------------------+
    | (query, positive, negative_1, ..., negative_n) | [Teacher(query, positive), Teacher(query, negative_i)...]  |
    +------------------------------------------------+------------------------------------------------------------+

Relations:
    - Similar to :class:`~sentence_transformers.sparse_encoder.losses.SparseMarginMSELoss` but uses KL divergence instead of MSE
    - More suited for distillation tasks where preserving ranking is important

Example:

    Using a teacher model to compute similarity scores for distillation:

    ::

        import torch
        from datasets import Dataset

        from sentence_transformers.sparse_encoder import SparseEncoder, SparseEncoderTrainer, losses

        student_model = SparseEncoder("distilbert/distilbert-base-uncased")
        teacher_model = SparseEncoder("naver/splade-cocondenser-ensembledistil")
        train_dataset = Dataset.from_dict(
            {
                "query": ["It's nice weather outside today.", "He drove to work."],
                "positive": ["It's so sunny.", "He took the car to work."],
                "negative": ["It's very cold.", "She walked to the store."],
            }
        )


        def compute_labels(batch):
            emb_queries = teacher_model.encode(batch["query"])
            emb_positives = teacher_model.encode(batch["positive"])
            emb_negatives = teacher_model.encode(batch["negative"])

            pos_scores = teacher_model.similarity_pairwise(emb_queries, emb_positives)
            neg_scores = teacher_model.similarity_pairwise(emb_queries, emb_negatives)

            # Stack the scores for positive and negative pairs
            return {"label": torch.stack([pos_scores, neg_scores], dim=1)}


        train_dataset = train_dataset.map(compute_labels, batched=True)
        loss = losses.SpladeLoss(
            student_model, loss=losses.SparseDistillKLDivLoss(student_model), document_regularizer_weight=3e-5, query_regularizer_weight=5e-5
        )

        trainer = SparseEncoderTrainer(model=student_model, train_dataset=train_dataset, loss=loss)
        trainer.train()


    With multiple negatives:

    ::

        import torch
        from datasets import Dataset

        from sentence_transformers.sparse_encoder import SparseEncoder, SparseEncoderTrainer, losses

        student_model = SparseEncoder("distilbert/distilbert-base-uncased")
        teacher_model = SparseEncoder("naver/splade-cocondenser-ensembledistil")
        train_dataset = Dataset.from_dict(
            {
                "query": ["It's nice weather outside today.", "He drove to work."],
                "positive": ["It's so sunny.", "He took the car to work."],
                "negative1": ["It's very cold.", "She walked to the store."],
                "negative2": ["Its rainy", "She took the bus"],
            }
        )


        def compute_labels(batch):
            emb_queries = teacher_model.encode(batch["query"])
            emb_positives = teacher_model.encode(batch["positive"])
            emb_negatives1 = teacher_model.encode(batch["negative1"])
            emb_negatives2 = teacher_model.encode(batch["negative2"])

            pos_scores = teacher_model.similarity_pairwise(emb_queries, emb_positives)
            neg_scores1 = teacher_model.similarity_pairwise(emb_queries, emb_negatives1)
            neg_scores2 = teacher_model.similarity_pairwise(emb_queries, emb_negatives2)

            # Stack the scores for positive and multiple negative pairs
            return {"label": torch.stack([pos_scores, neg_scores1, neg_scores2], dim=1)}


        train_dataset = train_dataset.map(compute_labels, batched=True)
        loss = losses.SpladeLoss(
            student_model, loss=losses.SparseDistillKLDivLoss(student_model), document_regularizer_weight=3e-5, query_regularizer_weight=5e-5
        )

        trainer = SparseEncoderTrainer(model=student_model, train_dataset=train_dataset, loss=loss)
        trainer.train()
)Úsimilarity_fctÚtemperatureN)ÚsuperÚ__init__)ÚselfÚmodelr   r   Ú	__class__s       €Ú|/home/james-whalen/.local/lib/python3.13/site-packages/sentence_transformers/sparse_encoder/losses/SparseDistillKLDivLoss.pyr   ÚSparseDistillKLDivLoss.__init__   s   ø€ ôv 	‰Ñ˜È;ÐÒWó    c                ó   • [        S5      e)NzSSparseDistillKLDivLoss should not be used alone. Use it with SpladeLoss or CSRLoss.)ÚAttributeError)r   Úsentence_featuresÚlabelss      r   ÚforwardÚSparseDistillKLDivLoss.forwardŠ   s   € ÜÐrÓsÐsr   © )r   r   r   ÚfloatÚreturnÚNone)r   zIterable[dict[str, Tensor]]r   r   r   r   )
Ú__name__Ú
__module__Ú__qualname__Ú__firstlineno__r   Úpairwise_dot_scorer   r   Ú__static_attributes__Ú__classcell__)r   s   @r   r
   r
      s*   ø† Ø<@×<SÑ<SÐjm÷ {Xñ {X÷ztò tr   r
   N)Ú
__future__r   Úcollections.abcr   Útorchr   Úsentence_transformersr   Ú-sentence_transformers.losses.DistillKLDivLossr   Ú2sentence_transformers.sparse_encoder.SparseEncoderr   r
   r   r   r   Ú<module>r.      s(   ðÝ "å $å å &Ý JÝ LôtÐ-õ tr   