
    ha9                        S SK r S SKrS SKrS SKJr  S SKJr  S SKJr  S SK	J
r
JrJr  S SKrS SKrS SKrS SKJrJr  S SKJr  S SKJrJr  S S	KJr  S S
KJr  S SKJr  \R@                  RB                  RE                  \#5      r$\
(       a  S SK%r%S SK&r%\ " S S\RN                  5      5       r(SSS\)\*   4S jr+ SSSS\)\*   S\\,   4S jjr- " S S\5      r. " S S\R^                  5      r0g)    N)Iterable)	dataclass)islice)TYPE_CHECKINGOptionalUnion)ArrowWriterParquetWriter)MAX_SHARD_SIZE)is_remote_filesystemrename)_BaseExamplesIterable)experimental)convert_file_size_to_intc                   V   ^  \ rS rSr% SrSr\\R                     \	S'   U 4S jr
SrU =r$ )SparkConfig   zBuilderConfig for Spark.Nfeaturesc                 "   > [         TU ]  5         g N)super__post_init__)self	__class__s    _/home/james-whalen/.local/lib/python3.13/site-packages/datasets/packaged_modules/spark/spark.pyr   SparkConfig.__post_init__%   s         )__name__
__module____qualname____firstlineno____doc__r   r   datasetsFeatures__annotations__r   __static_attributes____classcell__r   s   @r   r   r      s%    ",0Hhx(()0   r   r   dfpyspark.sql.DataFramenew_partition_orderc                     U R                  S5      R                  SUS    35      nUSS   H7  nU R                  S5      R                  SU 35      nUR                  U5      nM9     U$ )N*z
part_id = r      )selectwhereunion)r*   r,   df_combinedpartition_idpartition_dfs        r   _reorder_dataframe_by_partitionr6   )   sp    ))C.&&4G4J3K'LMK+AB/yy~++j,GH!''5 0 r   partition_order
state_dictc              #      #    SS K nU R                  SUR                  R                  R	                  5       R                  S5      5      nU(       a  US   OSn[        XAUS  5      nUR                  SS9nS nU(       a  US   OSn	[        XyS 5       Hh  n
U
R                  5       nUS   nUR                  S5        X:w  a  U(       a  Ub  US==   S-  ss'   UnSn	U(       a  U	S-   US'   U S	U	 3U4v   U	S-  n	Mj     g 7f)
Nr   r.   part_idpartition_idxT)prefetchPartitionspartition_example_idxr/   _)pysparkr0   sql	functionsspark_partition_idaliasr6   toLocalIteratorr   asDictpop)r*   r7   r8   r?   df_with_partition_idpartition_idx_startr5   rowscurr_partitionrow_idrowrow_as_dictr:   s                r   _generate_iterable_examplesrN   1   s    
 99S'++*?*?*R*R*T*Z*Z[d*ef9C*_523GYlYmInoL''4'@DN4>Z/0AFdD)jjli(	"$n8?+q0+$NF281*J./	6(#[00! *s   C<C>c                      ^  \ rS rSr S SU 4S jjjrS\4S jr\S\S\4U 4S jj5       rS r	S\
R                  R                  SS 4S	 jrSS
\S\SS 4S jjr\S\4S j5       rSrU =r$ )SparkExamplesIterableN   c                    > [         TU ]  5         Xl        U=(       d-    [        U R                  R                  R                  5       5      U l        g r   )r   __init__r*   rangerddgetNumPartitionsr7   )r   r*   r7   r   s      r   rS   SparkExamplesIterable.__init__O   s8    
 	.W%8T8T8V2Wr   returnc                 .    SSS.U l         U R                   $ )Nr   )r;   r=   )_state_dictr   s    r   _init_state_dict&SparkExamplesIterable._init_state_dictX   s    -.Kr   r8   c                 "   > [         TU ]  U5      $ r   )r   load_state_dict)r   r8   r   s     r   r_   %SparkExamplesIterable.load_state_dict\   s    w&z22r   c              #   v   #    [        U R                  U R                  U R                  5       S h  vN   g  N7fr   )rN   r*   r7   rZ   r[   s    r   __iter__SparkExamplesIterable.__iter__`   s'     .tww8L8LdN^N^___s   /979	generatorc                     [        [        U R                  R                  R	                  5       5      5      nUR                  U5        [        U R                  US9$ )Nr7   )listrT   r*   rU   rV   shufflerP   )r   rd   r7   s      r   shuffle_data_sources*SparkExamplesIterable.shuffle_data_sourcesc   sA    uTWW[[%A%A%CDE/*$TWWoNNr   
num_shardsindexc                 J    U R                  XUS9n[        U R                  US9$ )N)rk   rl   
contiguousrf   )split_shard_indices_by_workerrP   r*   )r   rk   rl   rn   r7   s        r   shard_data_sources(SparkExamplesIterable.shard_data_sourcesh   s)    <<
lv<w$TWWoNNr   c                 ,    [        U R                  5      $ r   )lenr7   r[   s    r   rk    SparkExamplesIterable.num_shardsl   s    4''((r   )rZ   r*   r7   r   )r*   r+   )T)r   r    r!   r"   rS   dictr\   r   r_   rb   nprandom	Generatorri   intrp   propertyrk   r'   r(   r)   s   @r   rP   rP   N   s     X#X X $   3$ 34 3 3`Obii.A.A OF] O
OS O ORi O )C ) )r   rP   c                   "  ^  \ rS rSr\r  SSSS\S\4U 4S jjjrS rS r	S	\
R                  R                  R                  4S
 jrS rS\S\S\S\\\\\\\4   4      4S jr   SSSS\S\\\\4      S\\   4S jjrSSS\4S jrSrU =r$ )Sparkq   r*   r+   	cache_dirworking_dirc                    > SS K nUR                  R                  R                  R	                  5       U l        Xl        X0l        [        TU ]$  " SU[        U R                  R                  5       5      S.UD6  g )Nr   )r~   config_namer   )r?   r@   SparkSessionbuildergetOrCreate_sparkr*   _working_dirr   rS   strsemanticHash)r   r*   r~   r   config_kwargsr?   r   s         r   rS   Spark.__init__t   sg     	kk..66BBD' 	
DGG0023	
 	
r   c                   ^ U R                   mU4S jnU R                  R                  R                  SS5      R	                  S5      (       a  g U R                   (       at  U R                  R
                  R                  [        S5      S5      R                  U5      R                  5       n[        R                  R                  US   5      (       a  g [        S5      e)Nc                    > [         R                  " TSS9  [         R                  R                  TS[        R
                  " 5       R                  -   5      n[        US5        U/$ )NT)exist_okfs_testa)osmakedirspathjoinuuiduuid4hexopen)context
probe_filer~   s     r   create_cache_and_write_probe?Spark._validate_cache_dir.<locals>.create_cache_and_write_probe   sL     KK	D1iTZZ\=M=M1MNJ S!<r   zspark.master localr/   r   ztWhen using Dataset.from_spark on a multi-node cluster, the driver and all workers should be able to access cache_dir)
_cache_dirr   confget
startswithsparkContextparallelizerT   mapPartitionscollectr   r   isfile
ValueError)r   r   prober~   s      @r   _validate_cache_dirSpark._validate_cache_dir   s     OO		  ;;3>>wGG
 ??((44U1XqAOOPlmuuw  ww~~eAh'' C
 	
r   c                 R    [         R                  " U R                  R                  S9$ )N)r   )r$   DatasetInfoconfigr   r[   s    r   _infoSpark._info   s    ##T[[-A-ABBr   
dl_managerc                 \    [         R                  " [         R                  R                  S9/$ )N)name)r$   SplitGeneratorSplitTRAIN)r   r   s     r   _split_generatorsSpark._split_generators   s     ''X^^-A-ABCCr   c                    SS K nS nU R                  R                  5       nUS::  a  UOSnU R                  R                  U5      R	                  S5      R                  US5      R                  UR                  R                  R                  S5      R                  S5      5      R                  5       S   R                  U-  nXd-  nXq:  a8  [        U[        Xq-  5      5      nU R                  R	                  U5      U l        g g )Nr   c              3   z   #    U  H1  n[         R                  R                  SUR                  /05      v   M3     g 7f)Nbatch_bytes)paRecordBatchfrom_pydictnbytes)itbatchs     r   get_arrow_batch_size=Spark._repartition_df_if_needed.<locals>.get_arrow_batch_size   s/     nn00-%,,1PQQ s   9;d   r/   zbatch_bytes: longr   sample_bytes)r?   r*   countlimitrepartition
mapInArrowaggr@   rA   sumrC   r   r   minry   )	r   max_shard_sizer?   r   df_num_rowssample_num_rowsapprox_bytes_per_rowapprox_total_sizenew_num_partitionss	            r   _repartition_df_if_neededSpark._repartition_df_if_needed   s    	R ggmmo)4);+ GGMM/*[^Z,.ABS&&**=9??OPWYq	
 \ 	 1>-!$[#6G6X2Y!Zgg))*<=DG .r   fpathfile_formatr   rX   c           	   #   D  ^^^^^	^
^^^#    SS K m	US:X  a  [        O[        mU R                  (       aG  [        R
                  R                  U R                  [        R
                  R                  T5      5      OTmUS:H  mU R                  R                  mU R                  mU R                  R                  m
UUUUU	U
UUU4	S jnU R                  R                  US5      R                  S5      R!                  T	R"                  R$                  R'                  S5      R)                  S5      T	R"                  R$                  R'                  S5      R)                  S	5      T	R"                  R$                  R+                  S5      R)                  S
5      T	R"                  R$                  R-                  S5      R)                  S5      5      R/                  5       nU H?  nUR0                  UR2                  UR4                  UR6                  UR8                  44v   MA     g 7f)Nr   parquetc           	   3   4  >	#    TR                   " 5       R                  5       n[        U S 5      nUc&  [        R                  R                  U/S/S/// SQS9$ SnT" TTR                  SUS 5      R                  SUS 5      TTTS9n[        R                  R                  U/5      nUR                  U5        U  H  nTb  UR                  T:  a  UR                  5       u  pxUR                  5         [        R                  R                  U/U/U/// SQS9v   US-  nT" UR                  TR                  SUS 5      R                  SUS 5      TTTS9n[        R                  R                  U/5      nUR                  U5        M     UR                  S:  aJ  UR                  5       u  pxUR                  5         [        R                  R                  U/U/U/// SQS9v   TT:w  a  [        R                  " [        R                   R#                  T5      5       Hr  n	[        R                   R%                  [        R                   R#                  T5      [        R                   R'                  U	5      5      n
[(        R+                  X5        Mt     g g 7f)	Nr   )task_idnum_examples	num_bytes)namesSSSSS05dTTTTT)r   r   writer_batch_sizestorage_optionsembed_local_filesr/   )TaskContexttaskAttemptIdnextr   r   from_arraysreplaceTablefrom_batcheswrite_table
_num_bytesfinalizeclose	_featuresr   listdirr   dirnamer   basenameshutilmove)r   r   first_batchshard_idwritertabler   r   r   filedestr   r   r   r   r?   r   working_fpathr   writer_classs              r   write_arrow0Spark._prepare_split_single.<locals>.write_arrow   sf    ))+99;Gr4.K"~~11YaS)B 2   H!!"**7xnFNNw[bcfZgi"3 /"3F HH));-8Eu%!-&2C2C~2U.4oo.?+LLLN..44!\NYK@F 5   MH)!'!1!1*227xnNVVW^cjknboq*;(7*;F --ug6""5)# &   1$*0//*;'nn00Y<B 1  
 %JJrww}'EFD77<<(>@P@PQU@VWDKK+ G &s   JJz2task_id: long, num_examples: long, num_bytes: longr   r   total_num_examplesr   total_num_bytesrk   shard_lengths)r?   r
   r	   r   r   r   r   r   r   r   _writer_batch_size_fsr   r*   r   groupByr   r@   rA   r   rC   r   collect_listr   r   r   r   rk   r   )r   r   r   r   r   statsrL   r   r   r?   r   r   r   r   s    ` `   @@@@@@@r   _prepare_split_singleSpark._prepare_split_single   s     	(3y(@}kTXTeTeT%6%68H8H8OPkp'94 ;;'' 33((222	, 2	,j GG{,`aWYS%%)).9??@TU%%))+6<<=NO%%++K8>>|L%%22>BHHY	 WY 	 C++ 6 68K8KS^^]`]n]nooo s   HH split_generatorzdatasets.SplitGeneratornum_procc                    ^^^^ U R                  5         [        U=(       d    [        5      nU R                  U5        [	        U R
                  5      (       + nU(       a  [        R                  R                  O[        R                  nSnU R                   SUR                   U SU 3n	U" U R                  U	5      mSn
SnSm/ n/ nU R                  TX#5       HG  u  pUu  nnnnUS:  d  M  U
U-  n
UU-  nTU-  mUR                  UU45        UR                  U5        MI     XR                  l        XR                  l        [$        R'                  ST S35        TS:  a  XR                  l        U R
                  mS[*        S	[*        S
[*        4UUU4S jjm/ nSn[-        [/        U5      5       H6  nUU   u  nn[-        U5       H  nUR                  UUU/5        US-  nM     M8     U R0                  R2                  R5                  U[/        U5      5      R7                  U4S j5      R9                  5         g SnUS   S   nU R;                  TR=                  SUS 5      R=                  SUS 5      TR=                  US5      5        g )Nz-TTTTT-SSSSS-of-NNNNN-.r   z	Renaming z shards.r/   r   r   global_shard_idc           	         > [        TTR                  SUS 5      R                  SU S 5      TR                  SUS 5      R                  STS 5      5        g )Nr   r   r   zTTTTT-SSSSSNNNNN)r   r   )r   r   r  r   fstotal_shardss      r   _rename_shard+Spark._prepare_split.<locals>._rename_shardO  s^    
 MM'hs^=EEgRYZ]Q^`MM-OC3HJRRSZ_klo^prr   c                    > T" U 6 $ r   r   )argsr  s    r   <lambda>&Spark._prepare_split.<locals>.<lambda>a  s    S`bfSgr   r   r   r   r   )r   r   r   r   r   r  r   r   r   	posixpathr   _output_dirr  appendextend
split_infor   r   loggerdebugr   ry   rT   rs   r   r   r   mapr   _renamer   )r   r  r   r   r	  kwargsis_local	path_joinSUFFIXfnamer   r   task_id_and_num_shardsall_shard_lengthsr   contentr   r   rk   r   r  r  ir   r  r   r  r  s                           @@@@r   _prepare_splitSpark._prepare_split  s~    	  "1.2RNS&&~6+DHH55$,BGGLL)..	(99+Q334VHAk]K$**E2!# $ : :5+ ^G 1}"l2"9,
*&--w
.CD!((7 !_ 3E""//>"", 	yh78!7H&&4 B			 "%	 	 DO3567&<Q&?# %j 1HKK(O DE#q(O !2 8
 KK$$00s4yAEEFghppr H,Q/2GLLg(39AA'gVY]\fb)r   c                 ,    [        U R                  5      $ r   )rP   r*   )r   r  s     r    _get_examples_iterable_for_split&Spark._get_examples_iterable_for_splitk  s     %TWW--r   )r   r   r*   )NN)arrowNN)r   r    r!   r"   r   BUILDER_CONFIG_CLASSr   rS   r   r   r$   downloaddownload_managerDownloadManagerr   r   ry   r   tupleboolr   r  r   r*  rP   r-  r'   r(   r)   s   @r   r|   r|   q   s!   &
 	
#
 
 	
 
&
BCDH,=,=,N,N,^,^ D>2RpRp Rp 	Rp
 
%T5e#445	6Rpn #48"&N2N N !sCx1	N
 3-N`.2. 
. .r   r|   r   )1r   r  r   collections.abcr   dataclassesr   	itertoolsr   typingr   r   r   numpyrv   pyarrowr   r$   datasets.arrow_writerr	   r
   datasets.configr   datasets.filesystemsr   r   datasets.iterable_datasetr   datasets.utilsr   datasets.utils.py_utilsr   utilslogging
get_loggerr   r  r?   pyspark.sqlBuilderConfigr   rg   ry   r6   ru   rN   rP   DatasetBuilderr|   r   r   r   <module>rH     s    	   $ !  1 1    < * < ' < 
			*	*8	4  (((    (? VZ[^V_  "&#Y : )1  )F~.H## ~.r   