
    h+                     L   S SK JrJr  SSKJrJrJr  SSKJrJ	r	  SSK
Jr  SSKJrJrJr  SSKJr  SSKJr  SS	KJr  \R,                  " \5      r\" S
\\5      r     SS\\   S\\\      S\\   S\\   S\\   S\S   S\4S jjr   SS\\   S\\   S\\   S\S\4
S jjrg)    )OptionalTypeVar   )Dataset_concatenate_map_style_datasets_interleave_map_style_datasets)DatasetDictIterableDatasetDict)DatasetInfo)IterableDataset_concatenate_iterable_datasets_interleave_iterable_datasets)
NamedSplit)logging)LiteralDatasetTypeNdatasetsprobabilitiesseedinfosplitstopping_strategyfirst_exhaustedall_exhausted!all_exhausted_without_replacementreturnc                    SSK Jn  SSKJn  U (       d  [	        S5      e[        U 5       H  u  p[        XU45      (       d  [        U	[        [        45      (       aF  U	(       d  [	        SU S35      e[	        SU S[        U	5       S	[        [        U	5      5       S
35      e[	        SU S[        U	5      R                   S35      eUS:X  a  [        X5      (       a  Xg4OXv4u  pM  [        U	W
5      (       a  M  [	        SU
R                   SWR                   SU S35      e   US;  a  [	        U S35      eW
UL a  [        XX#XES9$ [        U UUUUUS9$ )u9  
Interleave several datasets (sources) into a single dataset.
The new dataset is constructed by alternating between the sources to get the examples.

You can use this function on a list of [`Dataset`] objects, or on a list of [`IterableDataset`] objects.

    - If `probabilities` is `None` (default) the new dataset is constructed by cycling between each source to get the examples.
    - If `probabilities` is not `None`, the new dataset is constructed by getting examples from a random source at a time according to the provided probabilities.

The resulting dataset ends when one of the source datasets runs out of examples except when `oversampling` is `True`,
in which case, the resulting dataset ends when all datasets have ran out of examples at least one time.

Note for iterable datasets:

In a distributed setup or in PyTorch DataLoader workers, the stopping strategy is applied per process.
Therefore the "first_exhausted" strategy on an sharded iterable dataset can generate less samples in total (up to 1 missing sample per subdataset per worker).

Args:
    datasets (`List[Dataset]` or `List[IterableDataset]`):
        List of datasets to interleave.
    probabilities (`List[float]`, *optional*, defaults to `None`):
        If specified, the new dataset is constructed by sampling
        examples from one source at a time according to these probabilities.
    seed (`int`, *optional*, defaults to `None`):
        The random seed used to choose a source for each example.
    info ([`DatasetInfo`], *optional*):
        Dataset information, like description, citation, etc.
        <Added version="2.4.0"/>
    split ([`NamedSplit`], *optional*):
        Name of the dataset split.
        <Added version="2.4.0"/>
    stopping_strategy (`str`, defaults to `first_exhausted`):
        Three strategies are proposed right now, `first_exhausted`, `all_exhausted` and `all_exhausted_without_replacement`.
        By default, `first_exhausted` is an undersampling strategy, i.e the dataset construction is stopped as soon as one dataset has ran out of samples.
        If the strategy is `all_exhausted`,  we use an oversampling strategy, i.e the dataset construction is stopped as soon as every samples of every dataset has been added at least once.
        When strategy is `all_exhausted_without_replacement` we make sure that each sample in each dataset is sampled only once.
        Note that if the strategy is `all_exhausted`, the interleaved dataset size can get enormous:
        - with no probabilities, the resulting dataset will have `max_length_datasets*nb_dataset` samples.
        - with given probabilities, the resulting dataset will have more samples if some datasets have really low probability of visiting.
Returns:
    [`Dataset`] or [`IterableDataset`]: Return type depends on the input `datasets`
    parameter. `Dataset` if the input is a list of `Dataset`, `IterableDataset` if the input is a list of
    `IterableDataset`.

Example:

    For regular datasets (map-style):

    ```python
    >>> from datasets import Dataset, interleave_datasets
    >>> d1 = Dataset.from_dict({"a": [0, 1, 2]})
    >>> d2 = Dataset.from_dict({"a": [10, 11, 12]})
    >>> d3 = Dataset.from_dict({"a": [20, 21, 22]})
    >>> dataset = interleave_datasets([d1, d2, d3], probabilities=[0.7, 0.2, 0.1], seed=42, stopping_strategy="all_exhausted")
    >>> dataset["a"]
    [10, 0, 11, 1, 2, 20, 12, 10, 0, 1, 2, 21, 0, 11, 1, 2, 0, 1, 12, 2, 10, 0, 22]
    >>> dataset = interleave_datasets([d1, d2, d3], probabilities=[0.7, 0.2, 0.1], seed=42)
    >>> dataset["a"]
    [10, 0, 11, 1, 2]
    >>> dataset = interleave_datasets([d1, d2, d3])
    >>> dataset["a"]
    [0, 10, 20, 1, 11, 21, 2, 12, 22]
    >>> dataset = interleave_datasets([d1, d2, d3], stopping_strategy="all_exhausted")
    >>> dataset["a"]
    [0, 10, 20, 1, 11, 21, 2, 12, 22]
    >>> d1 = Dataset.from_dict({"a": [0, 1, 2]})
    >>> d2 = Dataset.from_dict({"a": [10, 11, 12, 13]})
    >>> d3 = Dataset.from_dict({"a": [20, 21, 22, 23, 24]})
    >>> dataset = interleave_datasets([d1, d2, d3])
    >>> dataset["a"]
    [0, 10, 20, 1, 11, 21, 2, 12, 22]
    >>> dataset = interleave_datasets([d1, d2, d3], stopping_strategy="all_exhausted")
    >>> dataset["a"]
    [0, 10, 20, 1, 11, 21, 2, 12, 22, 0, 13, 23, 1, 10, 24]
    >>> dataset = interleave_datasets([d1, d2, d3], probabilities=[0.7, 0.2, 0.1], seed=42)
    >>> dataset["a"]
    [10, 0, 11, 1, 2]
    >>> dataset = interleave_datasets([d1, d2, d3], probabilities=[0.7, 0.2, 0.1], seed=42, stopping_strategy="all_exhausted")
    >>> dataset["a"]
    [10, 0, 11, 1, 2, 20, 12, 13, ..., 0, 1, 2, 0, 24]
    For datasets in streaming mode (iterable):

    >>> from datasets import interleave_datasets
    >>> d1 = load_dataset('allenai/c4', 'es', split='train', streaming=True)
    >>> d2 = load_dataset('allenai/c4', 'fr', split='train', streaming=True)
    >>> dataset = interleave_datasets([d1, d2])
    >>> iterator = iter(dataset)
    >>> next(iterator)
    {'text': 'Comprar Zapatillas para niña en chancla con goma por...'}
    >>> next(iterator)
    {'text': 'Le sacre de philippe ier, 23 mai 1059 - Compte Rendu...'
    ```
r   )r   )r   z/Unable to interleave an empty list of datasets.aExpected a list of Dataset objects or a list of IterableDataset objects, but element at position   is an empty dataset dictionary.Dataset at position  has at least one split: N
Please pick one to interleave with the other datasets, for example: dataset[''] is a .r   Unable to interleave a  (at position 0) with a  (at position K). Expected a list of Dataset objects or a list of IterableDataset objects.r   z: is not supported. Please enter a valid stopping_strategy.)r   r   r   )arrow_datasetr   iterable_datasetr   
ValueError	enumerate
isinstancer	   r
   listnextitertype__name__r   r   )r   r   r   r   r   r   r   r   idatasetdataset_type
other_types               J/home/james-whalen/.local/lib/python3.13/site-packages/datasets/combine.pyinterleave_datasetsr:      s   N '1JKK)
'_#=>>'K1D#EFF${|}{~ : :  !*1#-FtG}o Vddhimnuivdwcxxz|  stusvv|  ~B  CJ  ~K  ~T  ~T  }U  UV  W  6.8.J.J*Q`Pj %L* G\22),*?*?)@@XYcYlYlXmm{|}{~  J  K ) *.  ii-..hijjw-TE
 	
 -/
 	
    dsetsaxisc                    U (       d  [        S5      e[        U 5       GH  u  pE[        U[        [        45      (       d  [        U[
        [        45      (       aF  U(       d  [        SU S35      e[        SU S[        U5       S[        [        U5      5       S35      e[        SU S[        U5      R                   S	35      eUS
:X  a1  [        U[        5      (       a  [        [        4O[        [        4u  pgM  [        UW5      (       a  M  [        SUR                   SWR                   SU S35      e   W[        L a
  [        XX#S9$ [        XX#S9$ )aW  
Converts a list of [`Dataset`] with the same schema into a single [`Dataset`].

Args:
    dsets (`List[datasets.Dataset]`):
        List of Datasets to concatenate.
    info (`DatasetInfo`, *optional*):
        Dataset information, like description, citation, etc.
    split (`NamedSplit`, *optional*):
        Name of the dataset split.
    axis (`{0, 1}`, defaults to `0`):
        Axis to concatenate over, where `0` means over rows (vertically) and `1` means over columns
        (horizontally).

        <Added version="1.6.0"/>

Example:

```py
>>> ds3 = concatenate_datasets([ds1, ds2])
```
z0Unable to concatenate an empty list of datasets.r   r    r!   r"   r#   r$   r%   r&   r   r'   r(   r)   r*   )r   r   r=   )r-   r.   r/   r   r   r	   r
   r0   r1   r2   r3   r4   r   r   )r<   r   r   r=   r5   r6   r7   r8   s           r9   concatenate_datasetsr?      s   : KLL&
'G_#=>>'K1D#EFF${|}{~ : :  !*1#-FtG}o Vddhimnuivdwcxxz|  stusvv|  ~B  CJ  ~K  ~T  ~T  }U  UV  W  6.8'.J.J/*Q`biPj %L* G\22),*?*?)@@XYcYlYlXmm{|}{~  J  K ) '. w.uuXX-eeWWr;   )NNNNr   )NNr   )typingr   r   r+   r   r   r   dataset_dictr	   r
   r   r   r,   r   r   r   splitsr   utilsr   utils.py_utilsr   
get_loggerr4   loggerr   r0   floatintr:   r?    r;   r9   <module>rJ      s   $ c c :  l l   # 
		H	% mWo>
 ,0"&"& 	Q
;Q
DK(Q
 3-Q
 ;
	Q

 JQ
 OQ
 Q
l #'"&	9X9X
;
9X J9X 	9X
 9Xr;   