
    -ji                     R    S SK Js  Jr  SSKJr  SSKJr  SSKJ	r	J
r
   " S S5      rg)    N   )Settings)Model)load_promptsprintc                   j    \ rS rSrS\S\4S jrS\S\4S jr	S\
4S jrS\\\\4   \\
4   4S	 jrS
rg)	Evaluator   settingsmodelc                    Xl         X l        [        5         [        SUR                  R                   S35        [        UR                  5      U l        [        S[        U R                  5       S35        [        S5        UR                  U R                  5      U l	        [        5         [        SUR                  R                   S35        [        UR                  5      U l        [        S[        U R                  5       S35        [        S5        U R                  5       U l        [        SU R                   S	[        U R                  5       35        g )
Nz+Loading good evaluation prompts from [bold]z[/]...z* [bold]z[/] prompts loadedz4* Obtaining first-token probability distributions...z*Loading bad evaluation prompts from [bold]z* Counting model refusals...z* Initial refusals: [bold][/]/)r   r   r   good_evaluation_promptsdatasetr   good_promptslenget_logprobs_batchedbase_logprobsbad_evaluation_promptsbad_promptscount_refusalsbase_refusals)selfr   r   s      K/home/james-whalen/.local/lib/python3.13/site-packages/heretic/evaluator.py__init__Evaluator.__init__   s'    
9(:Z:Z:b:b9ccij	
 ))I)IJT../00BCDDE"778I8IJ89X9X9`9`8aagh	
 ((G(GHT--.//ABC,-!002(););(<DTEUEUAV@WX	
    responsereturnc                 8   UR                  5       (       d  gUR                  5       R                  SS5      nUR                  SS5      nSR                  UR	                  5       5      nU R
                  R                   H  nUR                  5       U;   d  M    g   g)NT* u   ’' F)striplowerreplacejoinsplitr   refusal_markers)r   r   markers      r   
is_refusalEvaluator.is_refusal'   s    ~~ >>#++C4 ##E3/ 88HNN,-mm33F||~) 4 r   c                     U R                   R                  U R                  5      nU Vs/ s H  o R                  U5      (       d  M  UPM     nn[	        U5      $ s  snf )N)r   get_responses_batchedr   r,   r   )r   	responsesr   refusalss       r   r   Evaluator.count_refusals;   sI    JJ44T5E5EF	-6TY//(:SHYT8} Us   AAc                    [        S5        U R                  R                  U R                  5      n[        R
                  " UU R                  SSS9R                  5       n[        SUS S35        [        S5        U R                  5       n[        S	U S
[        U R                  5       35        X R                  R                  -  X0R                  -  4nXBU4$ )Nz6  * Obtaining first-token probability distributions...	batchmeanT)	reduction
log_targetz  * KL divergence: [bold]z.4fz[/]z  * Counting model refusals...z  * Refusals: [bold]r   )r   r   r   r   Fkl_divr   itemr   r   r   r   kl_divergence_scaler   )r   logprobskl_divergencer1   scores        r   	get_scoreEvaluator.get_score@   s    FG::2243D3DE!	

 $& 	 	)-)<C@A./&&($XJd3t7G7G3H2IJK ]]>>>***

 X--r   )r   r   r   r   r   r   N)__name__
__module____qualname____firstlineno__r   r   r   strboolr,   intr   tuplefloatr>   __static_attributes__ r   r   r	   r	      sU    
 
% 
63 4 ( 
.5ue|!4eS!@A .r   r	   )torch.nn.functionalnn
functionalr7   configr   r   r   utilsr   r   r	   rJ   r   r   <module>rP      s"         &I. I.r   