
    h                         S SK rS rSS jrg)    Nc                     X-  U R                  5       -  n[        R                  " U5      n[        XR                  5       -
  5      nUS:  a  X4-
  n[        R                  " [        R
                  " U5      5      SSS2   nU HZ  n[        R                  " Xh:H  5      u  n	[        [        U	5      U5      n
UR                  XSS9n	XI==   S-  ss'   XZ-  nUS:X  d  MZ    O   UR                  [        R                  5      $ )a  Computes approximate mode of multivariate hypergeometric.
This is an approximation to the mode of the multivariate
hypergeometric given by class_counts and n_draws.
It shouldn't be off by more than one.
It is the mostly likely outcome of drawing n_draws many
samples from the population given by class_counts.
Args
----------
class_counts : ndarray of int
    Population per class.
n_draws : int
    Number of draws (samples to draw) from the overall population.
rng : random state
    Used to break ties.
Returns
-------
sampled_classes : ndarray of int
    Number of samples drawn from each class.
    np.sum(sampled_classes) == n_draws

r   NF)sizereplace   )sumnpfloorintsortuniquewhereminlenchoiceastypeint64)class_countsn_drawsrng
continuousflooredneed_to_add	remaindervaluesvalueindsadd_nows              Q/home/james-whalen/.local/lib/python3.13/site-packages/datasets/utils/stratify.pyapproximate_moder       s    0 ',*:*:*<<Jhhz"G g-.KQ(	9-.tt4 Ehhy12GT
 #d)[1G::d%:@DMQM"Ka  >>"((##    c           	   #   2  #    [         R                  " U SS9u  pVUR                  S   n[         R                  " U5      n[         R                  " U5      S:  a  [        S5      eX:  a  [        SX4-  5      eX':  a  [        SX'4-  5      e[         R                  " [         R                  " USS	9[         R                  " U5      S
S 5      n	[        U5       H  n
[        XU5      nX-
  n[        XU5      n/ n/ n[        U5       Ha  nUR                  UU   5      nU	U   R                  USS9nUR                  US
UU    5        UR                  UUU   UU   UU   -    5        Mc     UR                  U5      nUR                  U5      nX4v   M     g
7f)aP  

Provides train/test indices to split data in train/test sets.
It's reference is taken from StratifiedShuffleSplit implementation
of scikit-learn library.

Args
----------

n_train : int,
    represents the absolute number of train samples.

n_test : int,
    represents the absolute number of test samples.

random_state : int or RandomState instance, default=None
    Controls the randomness of the training and testing indices produced.
    Pass an int for reproducible output across multiple function calls.

n_splits : int, default=10
    Number of re-shuffling & splitting iterations.
T)return_inverser      zMinimum class count errorzLThe train_size = %d should be greater or equal to the number of classes = %dzKThe test_size = %d should be greater or equal to the number of classes = %d	mergesort)kindNr   clip)mode)r	   r   shapebincountr   
ValueErrorsplitargsortcumsumranger    permutationtakeextend)yn_trainn_testr   n_splitsclasses	y_indices	n_classesr   class_indices_n_iclass_counts_remainingt_itraintestir0   perm_indices_class_is                      r   )stratified_shuffle_split_generate_indicesrC   6   s    . 1T:Ga I;;y)L	vvla455Z^e]qq
 	
 Y]c\oo
 	
 HHRZZ	DbiiP\F]^a_aFbcM8_|c:!-!35sCy!A//,q/:K#0#3#8#86#8#R LL-hA78KK,SVc!fs1voFG	 "
 &t$k! s   FF)
   )numpyr	   r    rC    r!   r   <module>rG      s    /$d5r!   