
    h_                         S r SSKrSSKrSSKJr  SSKJr  SSKJr  SSK	r
SSKrSSKJr   SSKJr  SS	KJr  S
 rS rS r SS jrS r " S S5      r " S S5      rS rg! \ a    Sr N:f = f)zTF-specific utils import.    N)partial)ceil)uuid4)get_context)SharedMemory   )configc                 N   [        U [        5      (       a  U $ [        R                  (       a  SS KnO[        S5      eU S   n0 nUR                  5        H  u  pE[        U[        R                  5      (       a-  [        R                  " U  Vs/ s H  ofU   PM	     sn5      X4'   MQ  [        XQR                  5      (       a(  UR                  U  Vs/ s H  ofU   PM	     sn5      X4'   M  [        R                  " U  Vs/ s H  ofU   PM	     sn5      X4'   M     U$ s  snf s  snf s  snf )Nr   FCalled a Tensorflow-specific function but Tensorflow is not installed.)
isinstancedictr	   TF_AVAILABLE
tensorflowImportErroritemsnpndarraystackTensorarray)featurestffirstbatchkvfs          Q/home/james-whalen/.local/lib/python3.13/site-packages/datasets/utils/tf_utils.pyminimal_tf_collate_fnr   $   s    (D!!			bccQKEEa$$xxx 8x!1x 89EH99%%xxx 8x!1x 89EHxxx 8x!1x 89EH  L !9 8 8s   D
D
;D"
c                 >    [        U 5      nSU;   a  US   US'   US	 U$ )Nlabellabels)r   )r   r   s     r   #minimal_tf_collate_fn_with_renamingr#   8   s-    !(+E%.h'NL    c                 J   [         R                  R                  U 5      (       a  [        U R                  5      $ [         R                  R                  U 5      =(       dE    [         R                  R                  U 5      =(       d    [         R                  R                  U 5      $ N)patypesis_listis_numeric_pa_type
value_type
is_integeris_floating
is_decimal)pa_types    r   r*   r*   @   sf    	xx  !'"4"45588w'h288+?+?+HhBHHL_L_`gLhhr$   c                    [        U [        R                  5      (       d  U R                  5       n Sn[        U [        R                  5      (       a  XR                  5          nSnO{[        R                  " [        R                  " U 5      S:H  5      (       a  XS   U S   S-    nO;[        U [        R                  5      (       a  X   nO[        S[        U 5       35      eUb1  UR                  5        V	V
s0 s H  u  pX;   d  U	S;   d  M  X_M     nn	n
U(       ai  [        [        UR                  5       5      S   5      n[        U5       VV	V
s/ s H*  oR                  5        V	V
s0 s H
  u  pXU   _M     sn
n	PM,     nn	nn
U" U40 UD6nU(       aJ  0 nUR                  5        H2  u  p[        R                  " X   5      nUR!                  U5      nUX'   M4     U$ / nUR                  5        H?  u  p[        R                  " X   5      nUR!                  U5      nUR#                  U5        MA     U$ s  sn
n	f s  sn
n	f s  sn
n	nf )NTF   r   zUnexpected type for indices: )r!   	label_idsr"   )r   r   r   numpyintegeritemalldiffRuntimeErrortyper   lenlistvaluesranger   astypeappend)indicesdatasetcols_to_retain
collate_fncollate_fn_argscolumns_to_np_typesreturn_dict
is_batchedr   keyvalueactual_sizei	out_batchcol
cast_dtyper   s                    r   np_get_batchrP   F   s    grzz**--/J'2::&&'
	 A%	&	&
WR[1_5	GRZZ	(	( :4=/JKK! $kkm
+
$/O(O CJ+ 	 
 $u||~.q12JOP[J\]J\Q++-@-JC#Qx--@J\]u00E	288:OCHHUZ(ELL,E"IN	  ;  	288:OCHHUZ(ELL,EU#	  ;
 5
 A]s$   7H6H6I(H<9I<Ic	           
      L  ^ ^^^^^^ [         R                  (       a  SSKmO[        S5      e[	        TS5      (       a  TR
                  mOm[	        TR                  R                  S5      (       a!  TR                  R                  R                  mO'[        T 5      S:  a  [        R                  " S5        Sm[        [        T UUUTSS	9mTR                  5        V	s/ s H  n	TR                  R!                  U	5      PM      sn	mTR#                  TR%                  STR&                  5      /S
9UUUU4S j5       n
TR(                  R*                  R-                  [        T 5      5      nU(       aE  TbB  TR/                  STR1                  STR&                  S9S9nU UU4S jnUR3                  X5      nO&U(       a  UR5                  UR7                  5       5      nUb  UR9                  XxS9nUR;                  U
5      nUb  UU4S jnOUU4S jnUR;                  U5      $ s  sn	f )a  Create a tf.data.Dataset from the underlying Dataset. This is a single-process method - the multiprocess
equivalent is multiprocess_dataset_to_tf.

Args:
    dataset (`Dataset`): Dataset to wrap with tf.data.Dataset.
    cols_to_retain (`List[str]`): Dataset column(s) to load in the
        tf.data.Dataset. It is acceptable to include column names that are created by the `collate_fn` and
        that do not exist in the original dataset.
    collate_fn(`Callable`): A function or callable object (such as a `DataCollator`) that will collate
        lists of samples into a batch.
    collate_fn_args (`Dict`): A  `dict` of keyword arguments to be passed to the
        `collate_fn`. Can be empty.
    columns_to_np_types (`Dict[str, np.dtype]`): A `dict` mapping column names to numpy dtypes.
    output_signature (`Dict[str, tf.TensorSpec]`): A `dict` mapping column names to
        `tf.TensorSpec` objects.
    shuffle(`bool`): Shuffle the dataset order when loading. Recommended True for training, False for
        validation/evaluation.
    batch_size (`int`, default `None`): Size of batches to load from the dataset. Defaults to `None`, which implies that
        the dataset won't be batched, but the returned dataset can be batched later with `tf_dataset.batch(batch_size)`.
    drop_remainder(`bool`, default `None`): Drop the last incomplete batch when loading. If not provided,
        defaults to the same setting as shuffle.

Returns:
    `tf.data.Dataset`
r   Nr   random_index_shuffleindex_shufflei zto_tf_dataset() can be memory-inefficient on versions of TensorFlow older than 2.9. If you are iterating over a dataset with a very large number of samples, consider upgrading to TF >= 2.9.F)rB   rC   rD   rE   rF   rG   )input_signaturec                    > TR                  TU /TS9n[        TR                  5       5       VVs0 s H
  u  p#X1U   _M     snn$ s  snnf )N)inpTout)py_function	enumeratekeys)rA   outputrL   rI   rF   	getter_fnr   touts       r   fetch_function%dataset_to_tf.<locals>.fetch_function   sT    	   

 .77J7O7O7Q-RS-R61AY-RSSSs   A   r2   )dtype)rJ   c                    > TR                  U S:H  5      (       a%  TR                  R                  SSTR                  S9n T" X[	        T5      S-
  S9nX4$ )Nr2   r`   l            )shapemaxvalrb   r1   )indexseed	max_index)
reduce_allrandomuniformint64r;   )staterf   shuffled_indexrB   rR   r   s      r   scan_random_index(dataset_to_tf.<locals>.scan_random_index   sY    }}Ub[)) 		))U"(()S1UXY`UadeUefN((r$   )drop_remainderc           
         > U R                  5        VVs0 s H%  u  pUTR                  UTU   R                  5      _M'     snn$ s  snnf r&   r   ensure_shaperd   
input_dictrI   valoutput_signaturer   s      r   ensure_shapes$dataset_to_tf.<locals>.ensure_shapes   sD    [e[k[k[mn[mxsC.>s.C.I.IJJ[mnnns   ,Ac                    > U R                  5        VVs0 s H(  u  pUTR                  UTU   R                  SS  5      _M*     snn$ s  snnf )Nr1   rs   ru   s      r   ry   rz      sM    _i_o_o_qr_qS[SVC.>s.C.I.I!".MNN_qrrrs   /A)r	   r   r   r   hasattrrR   rj   experimentalrS   r;   warningswarnr   rP   r=   dtypesas_dtypefunction
TensorSpecrl   dataDatasetr>   fillcastscanshufflecardinalityr   map)rB   rC   rD   rE   rF   rx   r   
batch_sizerq   rb   r^   
tf_dataset	base_seedro   ry   r\   rR   r   r]   s   `   ``         @@@@r   dataset_to_tfr   v   s   H bcc r)**!66	''	9	9!yy55CCw<*$MM*
  $%'/I 4G3M3M3OP3O%BIIu%3OPD[["--bhh"?!@[AT BT &&s7|4J'3GGD"(((CGD		)  __YB
	''
(>(>(@A
%%j%P
/J	o
	s >>-((W Qs   %H!c                   2    \ rS rSrS rS rS rS rS rSr	g)	SharedMemoryContext   c                      / U l         / U l        g r&   created_shmsopened_shmsselfs    r   __init__SharedMemoryContext.__init__   s    r$   c                     [        [        U5      XS9nU(       a  U R                  R                  U5        U$ U R                  R                  U5        U$ )N)sizenamecreate)r   intr   r@   r   )r   r   r   r   shms        r   get_shmSharedMemoryContext.get_shm   sK    D	D$$S) 
 ##C(
r$   c                     U R                  U[        R                  " U5      [        R                  " U5      R                  -  US9n[        R
                  " X#UR                  S9$ )N)r   r   r   )rb   buffer)r   r   prodrb   itemsizer   buf)r   r   rd   rb   r   r   s         r   	get_arraySharedMemoryContext.get_array   sG    ll2775>BHHUO<T<T+T]cldzz%SWW==r$   c                     U $ r&    r   s    r   	__enter__SharedMemoryContext.__enter__       r$   c                     U R                    H#  nUR                  5         UR                  5         M%     U R                   H  nUR                  5         M     g r&   )r   closeunlinkr   )r   exc_type	exc_value	tracebackr   s        r   __exit__SharedMemoryContext.__exit__   s?    $$CIIKJJL % ##CIIK $r$   r   N)
__name__
__module____qualname____firstlineno__r   r   r   r   r   __static_attributes__r   r$   r   r   r      s    >r$   r   c                   F    \ rS rSrS rS rS r\S 5       r\S 5       r	Sr
g)	NumpyMultiprocessingGeneratori  c                    Xl         X l        X0l        X@l        UR	                  5        VVs/ s H  u  pU[
        R                  L d  M  UPM     snnU l        UR	                  5        VVs0 s H-  u  pXU R                  ;  a  UO[
        R                  " S5      _M/     snnU l	        X`l
        Xpl        Xl        Xl        Xl        UR	                  5        VVs0 s HV  u  pXU R                  ;  a  [        UR                   R"                  5      O![        UR                   R"                  5      S-   _MX     snnU l        g s  snnf s  snnf s  snnf )NU1r1   )rB   rC   rD   rE   r   r   str_string_columnsrb   rF   rx   r   r   rq   num_workersr   rd   rankcolumns_to_ranks)r   rB   rC   rD   rE   rF   rx   r   r   rq   r   rN   rb   specs                 r   r   &NumpyMultiprocessingGenerator.__init__  s&    ,$.5H5N5N5Pe5PzsTY]_]d]dTds5Pe 2779$
9
 T%8%88bhhtnL9$
  !1$,& .335!
5	 D4G4G)GTZZ__%SQUQ[Q[Q`Q`MadeMee5!
 f$
!
s   E 	E +4EAEc              #   V  #    [        U R                  [        [        [	        U R
                  5      U R                  -  5      5      5      nU R                  U R
                  U R                  U R                  XR                  5      u  p#n[        S5      n/ n/ n/ n[        U5       V	s/ s H  oR                  5       PM     n
n	[        U5       V	s/ s H  oR                  5       PM     nn	U R
                  U R                  U R                  U R                  U R                   U R"                  U R$                  S.n['        5        n[        U5       H  n[)        [+        5       5      nSU SU 3S S nUR-                  U5        U R"                  R/                  5        VVs0 s H.  u  nnUUR1                  U SU S3U4[2        R4                  SS9_M0     nnnUR-                  U5        X.   nX:X  a  Ub  UnOS nUUUX   X   S	.UEnUR7                  U R8                  USS
9nUR;                  5         UR-                  U5        M     SnU(       Gdw  [        U5       GH]  nX   R=                  SS9(       d  [?        S5      eX   RA                  5         X~   n[C        S URE                  5        5       5      (       a  Sn  O['        5        nUR/                  5        VVs0 s H-  u  nnUUR1                  Xn    SU 3UU R                   U   SS9_M/     nnnUR/                  5        VVs0 s H  u  nnU[2        RF                  " U5      _M     nnnU R$                   H<  nUU   RI                  SUU   RJ                  S    35      RM                  S5      UU'   M>     S S S 5        Wv   X   RO                  5         GM`     U(       d  GMw  U H  nURQ                  5         M     S S S 5        g s  sn	f s  sn	f s  snnf s  snnf s  snnf ! , (       d  f       Nq= f! , (       d  f       g = f7f)Nspawn)rB   rC   rD   rE   rF   r   r   dw__
   _shapeTrd   rb   r   )worker_namerA   extra_batcharray_ready_eventarray_loaded_event)targetkwargsdaemonF<   )timeoutzData loading worker timed out!c              3   T   #    U  H  n[         R                  " US :  5      v   M      g7f)r   N)r   any).0rd   s     r   	<genexpr>9NumpyMultiprocessingGenerator.__iter__.<locals>.<genexpr>e  s"     P:O266%!),,:Os   &(Ur2   ))minr   r   r   r;   rB   r   distribute_batchesrq   r   r   r>   EventrC   rD   rE   rF   r   r   r   strr   r@   r   r   r   rl   Processworker_loopstartwaitTimeoutErrorclearr   r=   copyviewrd   squeezesetjoin)r   r   per_worker_batchesfinal_batchfinal_batch_workerctxnamesshape_arraysworkersr   array_ready_eventsarray_loaded_events	base_argsshm_ctxrL   worker_random_idr   rN   r   worker_shape_arraysworker_indicesfinal_batch_argworker_kwargsworkerend_signal_receivedarray_shapesbatch_shm_ctxrd   arraysarr
string_cols                                  r   __iter__&NumpyMultiprocessingGenerator.__iter__*  s    $**CS5F5X0Y,Z[>B>U>ULL$//4+>+>\\?
;); '"383EF3Eaiik3EF49+4FG4Fqyy{4FG ||"11//#33#'#;#; $ 5 5"11
	 !"g;'#&uw<  #A3a(8'9:3B?[) &*%:%:%@%@%B'%B	T **k]!C5+GPTw^`^f^fos*tt%B $ ' ##$78!3!6*{/F&1O&*O#.-#2);)>*=*@!  ! D,<,<][_`v&5 (8 #(){+A-055b5A*+KLL&)//1#/?LP,:M:M:OPPP /3+ -.- /;.@.@.B" /C
U  !8!8#(8*AcU 3&+&*&>&>s&C',	 "9 "  /C  " EKLLN!SNS#rwws|"3N!S*.*=*=J &z 2 7 7!F:<N<T<TUW<X;Y8Z [ c cdf g #:. +> /& !L'*..0K , *)R " "O #" GG"'X" "T /.a #"s   B&P)(O+P)O0)AP)AP 5O5
C6PP4O;P)$PAP*P	P"P)5P;P
PP
P&"P)c                     U $ r&   r   r   s    r   __call__&NumpyMultiprocessingGenerator.__call__  r   r$   c                 ^  ^ ^^^^^^	^
^^ S[         R                  S'   [        R                  (       a  SS KnO[        S5      eUR                  R                  / S5        UU
UUUUU UUU	4
S jn[        5        nUR                  5        VVs0 s H-  u  nnXR                  T	 SU S3U4[        R                  S	S
9_M/     snnmU H  nU" U5        M     Ub  U" U5        TR                  5        H  u  nnSUS S & M     T
R                  5         S S S 5        g s  snnf ! , (       d  f       g = f)N3TF_CPP_MIN_LOG_LEVELr   r   GPUc           
        >
 [        U TTT	T
TSS9n0 n[        5        nTR                  5        Hx  u  pEX   nUT;   a-  UR                  S5      R	                  UR
                  S-   5      nUR
                  TU   S S & UR                  T SU 3UR
                  USS9X$'   XbU   S S & Mz     TR                  5         TR                  5         TR                  5         S S S 5        g ! , (       d  f       g = f)NT)rA   rB   rC   rD   rE   rF   rG   r   )r2   r   r   )
rP   r   r   r   reshaperd   r   r   r   r   )rA   r   
out_arraysr   rN   rO   r   r   r   rD   rE   rC   rF   rB   r   r   r   s          r   send_batch_to_parentGNumpyMultiprocessingGenerator.worker_loop.<locals>.send_batch_to_parent  s     -% /$7 E J$&- (;'@'@'BOC!JEn, !&

4 0 8 8u9L M+0;;L%a(&3&=&=&-q.ekk\` '> 'JO */sOA& (C "%%'"'')"((*% '&&s   B=C##
C1r   r   Fr   r2   )osenvironr	   r   r   r   set_visible_devicesr   r   r   r   rl   r   )rB   rC   rD   rE   rF   r   r   rA   r   r   r   r   r   r  r   rN   r   r   r   r   s   ````` `  ```       @r   r   )NumpyMultiprocessingGenerator.worker_loop  s    .1

)*#fgg
		%%b%0	+ 	+B !"g "2!7!7!9!9IC &&+auF'CD7Z\ZbZbkp&qq!9L
 !$U+ !&$[1*002
Ua 3!!# #" #"s   8D4D ADD
D,c                    [         R                  " [        U 5      5      nU(       a  [         R                  R	                  U5        [        U5      nXfU-  -
  n[         R
                  " XW/5      u  pXU(       d  [        U5      S:X  a  S nUR                  SU5      n[        U5      n	XU-  -
  n
[         R
                  " XZ/5      u  p[UR                  SX15      n[         R
                  " XUR                  S   SS9nU Vs/ s H  n[         R                  " US5      PM     nn[        [        U5      5       H0  n[         R                  " X   X   R                  SS5      /SS9X'   M2     Ub  [        U5      nOS nXU4$ s  snf )Nr   r2   r1   )axis)r   aranger;   rj   r   splitr  rd   r   r>   concatenate)rB   r   rq   r   r   rA   num_samplesincomplete_batch_cutofflast_incomplete_batchnum_batchesfinal_batches_cutofffinal_batchesper_worker_indicesr   rL   incomplete_batch_worker_idxs                   r   r   0NumpyMultiprocessingGenerator.distribute_batches  sb   ))CL)IIg&'l #.z1I"J)+';T)U&S!671<$(!//"j1'l*K.GH!#'3I!J//"k>XXg}}Q/?aHRdeRdbjj;Rdes=)*A$&NN4F4I=K[KcKcdegiKj3krs$t! + !,*-m*<'*.'!:UUU fs   >!E>)r   rD   rE   rC   rF   r   rB   rq   r   rx   r   r   N)r   r   r   r   r   r  r  staticmethodr   r   r   r   r$   r   r   r     sA     
D_B E$ E$N V Vr$   r   c
                    [         R                  (       a  SSKn
O[        S5      e[	        U UUUUUUUUU	S9
nU
R
                  R                  R                  XS9nU(       a  [        [        U 5      U-  5      nO [        [        [        U 5      U-  5      5      nUR                  U
R
                  R                  R                  U5      5      $ )a  Create a tf.data.Dataset from the underlying Dataset. This is a multi-process method - the single-process
equivalent is dataset_to_tf.

Args:
    dataset (`Dataset`): Dataset to wrap with tf.data.Dataset.
    cols_to_retain (`List[str]`): Dataset column(s) to load in the
        tf.data.Dataset. It is acceptable to include column names that are created by the `collate_fn` and
        that do not exist in the original dataset.
    collate_fn(`Callable`): A function or callable object (such as a `DataCollator`) that will collate
        lists of samples into a batch.
    collate_fn_args (`Dict`): A  `dict` of keyword arguments to be passed to the
        `collate_fn`. Can be empty.
    columns_to_np_types (`Dict[str, np.dtype]`): A `dict` mapping column names to numpy dtypes.
    output_signature (`Dict[str, tf.TensorSpec]`): A `dict` mapping column names to
        `tf.TensorSpec` objects.
    shuffle(`bool`): Shuffle the dataset order when loading. Recommended True for training, False for
        validation/evaluation.
    batch_size (`int`, default `None`): Size of batches to load from the dataset. Defaults to `None`, which implies that
        the dataset won't be batched, but the returned dataset can be batched later with `tf_dataset.batch(batch_size)`.
    drop_remainder(`bool`, default `None`): Drop the last incomplete batch when loading. If not provided,
        defaults to the same setting as shuffle.
    num_workers (`int`): Number of workers to use for loading the dataset. Should be >= 1.

Returns:
    `tf.data.Dataset`
r   Nr   )
rB   rC   rD   rE   rF   rx   r   r   rq   r   )rx   )r	   r   r   r   r   r   r   from_generatorr   r;   r   applyr}   assert_cardinality)rB   rC   rD   rE   rF   rx   r   r   rq   r   r   data_generatorr   dataset_lengths                 r   multiprocess_dataset_to_tfr*    s    L bcc2%'/)%N ///bJS\Z78T#g,";<=BGG00CCNSTTr$   )F)__doc__r  r~   	functoolsr   mathr   uuidr   r4   r   pyarrowr'   multiprocessr   multiprocess.shared_memoryr   r    r	   r   r#   r*   rP   r   r   r   r*  r   r$   r   <module>r3     s      	       $7 (i ej-`n)b @mV mV`=Us  Ls   A" "A-,A-