
    љi8                     0   S SK J r   S SKrS SKrS SKJr  S SKJrJrJrJ	r	J
r
  S SKJr  S SKJrJr  SSKJrJr  S	rS
rSrSr " S S\5      r " S S\5      r " S S\5      r " S S\5      r " S S\5      r " S S\5      r " S S5      r " S S5      rg)    )datetimeN)Path)ListDictTupleUnionOptional)uuid4)	BaseModelField   )
LLMServiceToolCallz[result:true]zCompare the output content and evaluate whether the output is appropriate based on the evaluation criteria. If appropriate, output [result:true]. Regardless of whether it's appropriate or not, provide a reason for your evaluation.	eval_user	turn_userc                   *    \ rS rSr% \\S'   \\S'   Srg)EvaluationResult   resultreason N)__name__
__module____qualname____firstlineno__bool__annotations__str__static_attributes__r       N/home/james-whalen/.local/lib/python3.13/site-packages/aiavatar/eval/dialog.pyr   r      s    LKr    r   c                   H    \ rS rSr% Sr\\   \S'   \\S'   Sr	\\
   \S'   Srg)ToolCallResult   Ndatais_finaltextr   )r   r   r   r   r%   r	   dictr   r   r'   r   r   r   r    r!   r#   r#      s#    D(4.ND(3-r    r#   c                   \    \ rS rSr% \\S'   \\S'   Sr\\\\	4      \S'   Sr
\\   \S'   Srg)r   !   idnameN	argumentsr   r   )r   r   r   r   r   r   r-   r	   r   r(   r   r#   r   r   r    r!   r   r   !   s3    G
I,0Ixc4i()0'+FH^$+r    r   c                       \ rS rSr% Sr\\   \S'   Sr\\   \S'   Sr	\\   \S'   Sr
\\   \S'   Sr\\   \S'   Sr\\   \S'   Sr\\   \S	'   S
rg)Turn(   N
input_textexpected_output_textevaluation_criteriaevaluation_function_nameactual_output_textactual_tool_callevaluation_resultr   )r   r   r   r   r1   r	   r   r   r2   r3   r4   r5   r6   r   r7   r   r   r   r    r!   r/   r/   (   sm     $J$*.(3-.)-#-.2hsm2(,,+/hx(/48x 018r    r/   c                       \ rS rSr% Sr\\   \S'   \" \	S9r
\\   \S'   Sr\\   \S'   Sr\\   \S'   Sr\\   \S'   Sr\\   \S	'   S
\4S jrSrg)Scenario2   Nr,   )default_factoryturnsgoalscenario_evaluation_resultuser_iderrorreturnc                 :    [        S U R                   5       5      $ )Nc              3   <   #    U  H  oR                   S Lv   M     g 7fN)r5   .0turns     r!   	<genexpr>1Scenario.has_execution_results.<locals>.<genexpr>;   s     N:4**$6:s   )allr<   )selfs    r!   has_execution_resultsScenario.has_execution_results:   s    N4::NNNr    r   )r   r   r   r   r,   r	   r   r   r   listr<   r   r/   r=   r>   r   r?   r@   r   rL   r   r   r    r!   r9   r9   2   sl    D(3-d3E4:3D(3-=A)9 :A!GXc]!E8C=Ot Or    r9   c                       \ rS rSrSrg)ValidationError?   r   N)r   r   r   r   r   r   r    r!   rP   rP   ?   s    r    rP   c                   n    \ rS rSr\S\SS4S j5       r\S\SS4S j5       r\S\	\   SS4S	 j5       r
S
rg)DataValidatorC   rG   rA   Nc                 <    U R                   (       d  [        S5      eg )NzTurn must have input_text)r1   rP   )rG   s    r!   validate_turnDataValidator.validate_turnD   s    !"=>> r    scenarioc                     U R                   (       d  [        S5      e[        U R                   5       H  u  p [        R	                  U5        M     g ! [         a  n[        SUS-    SU 35      eS nAff = f)Nz$Scenario must have at least one turnzTurn    : )r<   rP   	enumeraterS   rV   )rX   irG   ves       r!   validate_scenarioDataValidator.validate_scenarioI   sk    ~~!"HII 0GA;++D1 1 # ;%acU"RD&9::;s   A
A5A00A5	scenariosc           	          U (       d  [        S5      e[        U 5       H  u  p [        R                  U5        M     g ! [          a5  n[        SUR                  =(       d    [        US-   5       SU 35      eS nAff = f)Nz*Dataset must contain at least one scenarioz	Scenario rZ   r[   )rP   r\   rS   r_   r,   r   )ra   r]   rX   r^   s       r!   validate_scenarios DataValidator.validate_scenariosT   st    !"NOO$Y/KAU//9 0 # U%	(--2K3qs81LBrd&STTUs   >
A=0A88A=r   )r   r   r   r   staticmethodr/   rV   r9   r_   r   rc   r   r   r    r!   rS   rS   C   sn    ?D ?T ? ? ;H ; ; ; Ud8n U U Ur    rS   c                   t   \ rS rSrS%S\S\S\\\4   S\4S jjrS\S\S	\S
\S\	\\
\S4   4   4
S jrS&S\S\S	\S\	\\
\S4   4   4S jjrS\4S jrS\S\S\S\S\4
S jrS\S\4S jrSSSS.S\
\\   \4   S\S\S\S\\   4
S jjrS'S\\   S\4S  jjrS\\   S!\4S" jr\S!\S\\   4S# j5       rS$rg)(DialogEvaluatora   Nllmevaluation_llmevaluation_functionsdefault_user_idc                     Xl         X l        U R                  (       a0  U R                  R                  (       d  [        U R                  l        U=(       d    0 U l        U=(       d    [
        U l        g rD   )ri   rj   system_prompt DEFAULT_EVALUATION_SYSTEM_PROMPTrk   DEFAULT_AIAVATAR_USER_IDrl   )rK   ri   rj   rk   rl   s        r!   __init__DialogEvaluator.__init__b   sM    ,t':':'H'H0PD-$8$>B!.J2Jr    
context_idr?   r'   rA   c                 B  #    SnS n UR                  UUUS9  S h  vN nUR                  (       a  UR                  nUR                  (       d  M9  XWR                  -  nMI   ND
 XV4$ ! [         a*  n[	        SU S[
        R                  " 5        35        e S nAff = f7f)N rs   r?   r'   zError during turn processing: 
)chat_stream	tool_callr'   	Exceptionprint	traceback
format_exc)	rK   ri   rs   r?   r'   result_textry   respexs	            r!   get_llm_response DialogEvaluator.get_llm_responsej   s     		!oo% .  -d
 >> $I99999,K-  )) 	22$b9M9M9O8PQR	sN   BA( A$A"A$-A( A( "A$$A( 'B(
B2%BBBrG   c                    #    U R                   (       d  [        S5      e[        R                  U5        U R	                  U R                   UU=(       d    U R
                  UR                  S9I S h  vN u  pEXE4$  N	7f)Nz,LLM service is required for processing turnsri   rs   r?   r'   )ri   rP   rS   rV   r   rl   r1   )rK   rs   rG   r?   r5   ry   s         r!   process_turnDialogEvaluator.process_turn|   st     xx!"PQQ##D).2.C.C!3t33	 /D /
 )
% ",,)
s   A.A<0A:1
A<rX   c                   #    [        [        5       5      n[        UR                  S5       H  u  p4[	        SUR
                   SU S[        UR                  5       SUR                  S S  S3	SS	S
9   U R                  X$UR                  5      I S h  vN u  Ul
        nU(       a*  [        R                  UR                  5       5      Ul        M  M     [	        5         g  NL! [         a  n[	        SU SU S35        e S nAff = f7f)NrZ   zProcessing Scenario  - Turn /r[   r:   ...ru   Tendflushz
Error in turn z. Stopping scenario processing.)r   r
   r\   r<   r{   r,   lenr1   r   r?   r5   r   model_validateto_dictr6   rz   )rK   rX   rs   r]   rG   ry   r   s          r!   process_scenario DialogEvaluator.process_scenario   s     \
 3GA*8==/!Ac(..FYEZZ\]a]l]lmpnp]q\rruv|~  GK  L;?;L;LZ_g_o_o;p5p2',4,C,CIDUDUDW,XD) 	 4 	 6q  (2bT1PQRs<   A1D4C"C ;C"D C""
D,C??DDoutput_textry   r3   r4   c                 h  #    U(       a  U(       d  [        S5      eSU SU SU 3nU R                  U R                  [        [	        5       5      [
        US9I S h  vN u  pg[        UR                  5       ;   nU(       a'  U R                  R                  U5      n	U	" XX8U5      u  p[        XS9$  NT7f)Nz5Both output_text and evaluation_criteria are requiredz
## Output
z

## ToolCall
z

## Evaluation Criteria
r   r   r   )rP   r   rj   r   r
   DEFAULT_EVALUATOR_USER_IDEVALUATION_SUCCESS_MARKERlowerrk   getr   )
rK   r   ry   r3   r4   eval_input_texteval_result_text_r   evaluation_functions
             r!   evaluate_turn_output$DialogEvaluator.evaluate_turn_output   s     "5!"YZZ'}4Ei[Pl  nA  mB  C$($9$9##57|- 	 %: %
 
 +.>.D.D.FF#"&";";"?"?@X"Y':;Sf  qA  (B$FvGG
s   AB2B0AB2c                   #    [         R                  U5        UR                  (       d  [        S5      eSnUR                   H?  nUR
                  (       d  [        S5      eUSUR                   SUR
                   S3-  nMA     SU SUR                   S	3nU R                  U R                  [        [        5       5      [        US
9I S h  vN u  pV[        UR                  5       ;   n[        XuS9$  N&7f)Nz(Scenario must have a goal for evaluationru   z>All turns must have actual_output_text for scenario evaluationzUser: z
Assistant: z

z## Full Conversation
z	
## Goal
zN

Evaluate whether the goal was achieved based on the full conversation above.rv   r   )rS   r_   r=   rP   r<   r5   r1   r   rj   r   r
   r   r   r   r   )rK   rX   conversation_textrG   r   r   r   r   s           r!   evaluate_scenario_goal&DialogEvaluator.evaluate_scenario_goal   s     ''1}}!"LMMNND**%&fgg6$//):-H_H_G``d!ee # %%6$7{8==/ R[ [ 	 %)$9$957|- 	 %: %
 
 +.>.D.D.FFvGG
s   C
C5C3'C5TF)detailedoverwrite_executionoverwrite_evaluationdatasetr   r   r   c                  #    [        U[        5      (       a.  Un U R                  U5      n[        S[	        U5       SU 35         [        R                  U5        / n[	        U5      n	[        US5       GH  u  p UR                  5       (       a&  U(       d  [        SU
 SU	 S	UR                   35        O8[        SU
 SU	 S
UR                   35        U R                  U5      I S h  vN    U R                  (       GaL  U(       Ga  [        UR                   5       H  u  pUR"                  (       d  M  UR$                  b	  U(       d  M.  [        SUR&                   SUS-    S[	        UR                   5       SUR(                  S S  S3	SSS9   U R+                  UR,                  UR.                  UR"                  UR0                  S9I S h  vN Ul        M     U(       a+  [3        S UR                    5       5      (       a
  [        5         UR4                  b  U(       a*  [        S5         U R7                  U5      I S h  vN Ul        UR                  U5        [        SU
 S35        GM     [        SU	 S 35        U$ ! [
         a  n[        SU SU 35        e S nAff = f! [         a  n[        SU 35        e S nAff = f GN! [
         aQ  n[        SU
 SU	 SU S35        [        U5      Ul        UR                  U5        [        SU
 S35         S nAGM  S nAff = f GNZ! [
         a!  n[        SUS-    SU S35         S nAGM4  S nAff = f GN! [
         a  n[        SU S35         S nAGN/S nAff = f7f)!NzLoaded z scenarios from zFailed to load dataset from r[   zDataset validation failed: rZ   [r   z] Use pre-executed scenario: z] Processing scenario: z] Scenario failed: z . Continuing with next scenario.u   ✗ Scenario z failedzEvaluating Scenario r   r:   r   ru   Tr   )r   ry   r3   r4   
Turn z evaluation failed: z. Continuing with next turn.c              3   8   #    U  H  oR                   v   M     g 7frD   )r3   rE   s     r!   rH   &DialogEvaluator.run.<locals>.<genexpr>   s     '\^T(@(@^s   zEvaluating overall scenario...zScenario evaluation failed: u   ✓ Scenario z
 completedu   ✓ All z scenario(s) completed!)
isinstancer   load_resultsr{   r   rz   rS   rc   rP   r\   rL   r=   r   r@   appendrj   r<   r3   r7   r,   r1   r   r5   r6   r4   anyr>   r   )rK   r   r   r   r   filepathr   r^   ra   total_scenariosscenario_idxrX   r]   rG   s                 r!   runDialogEvaluator.run   s    gs##H++H5G~-=hZHI	,,W5
 	g,&/&;"L1133<OAl^1_,==Z[c[h[hZijkAl^1_,==TU]UbUbTcde//999 """#,X^^#<333#55AJ^ (!$:8==/RSTURUQVVWX[\d\j\jXkWllnoso~o~  @C  AC  pD  oE  EH  #I  OQ  Y]  ^	)?C?X?X040G0G.2.C.C8<8P8P=A=Z=Z	 @Y @" :" 6 $="  C'\X^^'\$\$\ 66>BV:;cDHD_D_`hDi>i; X&M,z:;] '<` 	))@AB  4XJbEF  	/t45	 : ,q(99LRDPpqr!$R  *l^7;<$:" $- ) %!u4HLh&i j () ?j$ c <RD@`abbcs  N+J J' N<:K6N72K)K*K.AN6AN=L,L)	L,AN/MM	M6N
J$JJ$$N'
K1K  KNK
L&AL!N!L&&N)L,,
M6MNMNM
N'M<6N<NNra   c           
         [        U5       GHR  u  p4UR                  =(       d    [        US-   5      n[        SU S35        [        SUR                   35        U(       Ga  UR
                   Vs/ s H  ofR                  (       d  M  UPM     nnU(       GaZ  [        UR
                  5       GH  u  pUR                  (       d  M  [        SUS-    S35        [        SUR                   35        [        SUR                   35        [        S	UR                   35        [        S
UR                   35        [        SUR                   35        [        SUR                   35        [        SUR                  R                  (       a  SOS 35        [        SUR                  R                   35        GM     [        S U 5       5      n	[!        U5      n
[        SU	 SU
 SX-  S-  S S35        UR"                  (       d  GM  [        S5        [        SUR"                  R                  (       a  SOS 35        [        SUR"                  R                   35        GMU     g s  snf )NrZ   z
=== Scenario z ===zGoal: r   :z	  Input: z  Expected Output: z  Actual Output: z  Actual ToolCall: z  Evaluation Criteria: z  Evaluation Function: z
  Result: u   ✓ PASSu   ✗ FAILz
  Reason: c              3   ^   #    U  H#  oR                   R                  (       d  M  S v   M%     g7f)rZ   N)r7   r   rE   s     r!   rH   0DialogEvaluator.print_results.<locals>.<genexpr>%  s      `tBXBXB_B_s   -	-z

Summary: r   z turns passed (d   z.1fz%)z$
=== Overall Scenario Evaluation ===zGoal Achievement: u   ✓ SUCCESSu
   ✗ FAILEDzReason: )r\   r,   r   r{   r=   r<   r7   r1   r2   r5   r6   r3   r4   r   r   sumr   r>   )rK   ra   r   r]   rX   scenario_namerG   evaluated_turnsjpassedtotals              r!   print_resultsDialogEvaluator.print_results  s   $Y/KA$MM5S1XMOM?$78F8==/*+4<NN"]NDF\F\4N"]"#,X^^#<111!GAaC5"23!Idoo->"?@!$78Q8Q7R"ST!$5d6M6M5N"OP!$78M8M7N"OP!$;D<T<T;U"VW!$;D<Y<Y;Z"[\!JT=S=S=Z=Zz`j.k"lm!Jt/E/E/L/L.M"NO $= ! ` ``F0EKxqv|TWGWX[F\\^_` 222=?*H<_<_<f<f=lx*yz{!D!D!K!K LMN= 0 #^s   0I$I$r   c                    [         R                  " 5       R                  5       U Vs/ s H  o3R                  5       PM     snS.n[	        USSS9 n[
        R                  " XESSS9  S S S 5        g s  snf ! , (       d  f       g = f)N)	timestampra   wutf-8encodingFr   )ensure_asciiindent)r   now	isoformat
model_dumpopenjsondump)rK   ra   r   rX   r%   fs         r!   save_resultsDialogEvaluator.save_results/  sk    !113@IJ	H--/	J

 (C'2aIIdE!< 32 K 32s   A0A55
Bc                    [        U 5      nUR                  5       (       d  [        SU  35      e [        U SSS9 n[        R
                  " U5      nS S S 5        SW;  a  [        S5      e/ n US    H)  n[        R                  U5      nUR                  U5        M+     U$ ! , (       d  f       NV= f! [         a  n[        SU  SU 35      eS nAff = f! [         a  n[        S	U 35      eS nAff = f)
NzDataset file not found: rr   r   zFailed to read file r[   ra   z)Dataset file must contain 'scenarios' keyzFailed to parse scenario data: )r   existsFileNotFoundErrorr   r   loadrz   IOError
ValueErrorr9   r   r   )r   filepath_objr   r%   r   ra   scenario_datarX   s           r!   r   DialogEvaluator.load_results8  s   H~""$$#&>xj$IJJ	Chg6!yy| 7
 d"HII		E!%k!2#22=A  * "3 ! 76 	C0
"RDABB	C  	E>rdCDD	EsL   B3 B"B3 .2C "
B0,B3 0B3 3
C=CC
C3 C..C3)rl   rk   rj   ri   )NNNrD   )T)r   r   r   r   r   r   r   callablerq   r   r   TCr   r/   r   r9   r   r   r   r   r   r   r   r   r   r   re   r   r   r   r    r!   rg   rg   a   s   KJ K
 Kaefiksfsat K  OR K* # PS [^ chilnstvx|t|n}i}c~ $-S - -s -V[\_afgikogoap\pVq -x Hc Hh Heh H  EH H  M] H(HX HBR H4 RVsx  X] FE$x.#*=$> F$ Flp F  QU F  bf  go  bp FPOtH~ O OB=d8n = = s tH~  r    rg   ) r   r   r|   pathlibr   typingr   r   r   r   r	   uuidr
   pydanticr   r   sts.llmr   r   r   r   ro   r   rp   r   r#   r/   r9   rz   rP   rS   rg   r   r    r!   <module>r      s        5 5  % 0 , [ !
 ( & y 
Y ,y ,99 9	Oy 	O	i 	U U<n nr    