
    h#                        S SK r S SKJr  S SKJrJrJrJrJrJ	r	J
r
JrJr  SSKJrJr  \\
\   \4   r\" S5      r\\\   /\\\      4   rSS.S\S	\S
\S\	\\/\4      S\4
S jjrSS.S\S\S
\S\	\\/\4      S\4
S jjr SS\S\	\\/\4      S\4S jjrSS\4S\\   S\S	\S
\S\S\\\      4S jjrSS\4S\\   S\S\\\      4S jjr\4S\
\   S\S\\\      4S jjrg)    N)partial)	AnyCallableIterableIteratorListOptionalSequenceTypeVarUnion   )	minibatchregistryItemT)
get_lengthsizebufferdiscard_oversizer   returnc                 <    Ub  SU0O0 n[        [        4U UUS.UD6$ )aU  Create a batcher that uses the `batch_by_padded_size` strategy.

The padded size is defined as the maximum length of sequences within the
batch multiplied by the number of sequences in the batch.

size (int or Sequence[int]): The largest padded size to batch sequences into.
    Can be a single integer, or a sequence, allowing for variable batch sizes.
buffer (int): The number of sequences to accumulate before sorting by length.
    A larger buffer will result in more even sizing, but if the buffer is
    very large, the iteration order will be less random, which can result
    in suboptimal training.
discard_oversize (bool): Whether to discard sequences that are by themselves
    longer than the largest padded batch size.
get_length (Callable or None): Function to get the length of a sequence item.
    The `len` function is used by default.
r   )r   r   r   )r   minibatch_by_padded_size)r   r   r   r   	optionalss        Q/home/james-whalen/.local/lib/python3.13/site-packages/spacy/training/batchers.py"configure_minibatch_by_padded_sizer      s=    0 /9.Dz*"I )	
      	tolerancec                 <    Ub  SU0O0 n[        [        4U UUS.UD6$ )a  Create a batcher that uses the "minibatch by words" strategy.

size (int or Sequence[int]): The target number of words per batch.
    Can be a single integer, or a sequence, allowing for variable batch sizes.
tolerance (float): What percentage of the size to allow batches to exceed.
discard_oversize (bool): Whether to discard sequences that by themselves
    exceed the tolerated size.
get_length (Callable or None): Function to get the length of a sequence
    item. The `len` function is used by default.
r   )r   r   r   )r   minibatch_by_words)r   r   r   r   r   s        r   configure_minibatch_by_wordsr   8   s=    " /9.Dz*"I)	
  r   c                 8    Ub  SU0O0 n[        [        4SU 0UD6$ )zCreate a batcher that creates batches of the specified size.

size (int or Sequence[int]): The target number of items per batch.
    Can be a single integer, or a sequence, allowing for variable batch sizes.
r   r   )r   r   )r   r   r   s      r   configure_minibatchr!   S   s*     /9.Dz*"I9545955r      Fseqsc              #   ~  #    [        U[        5      (       a  [        R                  " U5      nO[	        U5      n[        XS9 Hp  n[        U5      n[        U5      n[        XgU5       HG  nU V	s/ s H  oU	   PM	     n
n	[        S U
 5       5      [        U
5      -  nU(       a  X:  a  MC  U
v   MI     Mr     gs  sn	f 7f)a=  Minibatch a sequence by the size of padded batches that would result,
with sequences binned by length within a window.

The padded size is defined as the maximum length of sequences within the
batch multiplied by the number of sequences in the batch.

size (int or Sequence[int]): The largest padded size to batch sequences into.
buffer (int): The number of sequences to accumulate before sorting by length.
    A larger buffer will result in more even sizing, but if the buffer is
    very large, the iteration order will be less random, which can result
    in suboptimal training.
discard_oversize (bool): Whether to discard sequences that are by themselves
    longer than the largest padded batch size.
get_length (Callable or None): Function to get the length of a sequence item.
    The `len` function is used by default.
)r   c              3   8   #    U  H  n[        U5      v   M     g 7fNlen).0seqs     r   	<genexpr>+minibatch_by_padded_size.<locals>.<genexpr>   s     ;(3c#hh(   N)
isinstanceint	itertoolsrepeatiterr   listnext_batch_by_lengthmaxr(   )r#   r   r   r   r   size_outer_batchtarget_sizeindicesisubbatchpadded_sizes               r   r   r   _   s     . $  &T
 3;'5k'*MG0781AH8;(;;c(mKKK$> N 4 9s   A-B=/B8=A B=g?c              #     #    [        U[        5      (       a  [        R                  " U5      nO[	        U5      n[        U5      nXb-  n/ n/ n	Sn
SnU  H  nU" U5      nXU-   :  a  U(       d  U/v   M!  M#  US:X  a  X-   U::  a  UR                  U5        X-  n
MH  X-   U-   Xg-   ::  a  U	R                  U5        X-  nMl  U(       a  Uv   [        U5      nXb-  nU	nUn
/ n	SnX-   U::  a  UR                  U5        X-  n
M  X-   Xg-   ::  a  U	R                  U5        X-  nM  U(       a  Uv   [        U5      nXb-  nU/nUn
M     UR                  U	5        U(       a  Uv   gg7f)a  Create minibatches of roughly a given number of words. If any examples
are longer than the specified batch length, they will appear in a batch by
themselves, or be discarded if discard_oversize=True.

seqs (Iterable[Sequence]): The sequences to minibatch.
size (int or Sequence[int]): The target number of words per batch.
    Can be a single integer, or a sequence, allowing for variable batch sizes.
tolerance (float): What percentage of the size to allow batches to exceed.
discard_oversize (bool): Whether to discard sequences that by themselves
    exceed the tolerated size.
get_length (Callable or None): Function to get the length of a sequence
    item. The `len` function is used by default.
r   N)r.   r/   r0   r1   r2   r4   appendextend)r#   r   r   r   r   r7   r9   tol_sizebatchoverflow
batch_sizeoverflow_sizer*   n_wordss                 r   r   r      s    ( $  &T
u+K&HEHJMS/ 8++#e $ aZ%9k$ILL!J(728NOOOC $M u+K".HE&JHM$4S!%
&K,BC$( K"5k&2$
Q R 
LL s   E E"	max_wordsc                 .   [        U 5       VVs/ s H  u  p4U" U5      U4PM     nnnUR                  5         / n/ nU H[  u  pU(       d  UR                  U5        M  U[        U5      S-   -  U::  a  UR                  U5        MG  UR                  U5        U/nM]     U(       a  UR                  U5        [	        S U 5       5      [        U 5      :X  d   eU Vs/ s H  n[        [        U5      5      PM     nnUR                  5         U$ s  snnf s  snf )zGiven a list of sequences, return a batched list of indices into the
list, where the batches are grouped by length, in descending order.

Batches may be at most max_words in size, defined as max sequence length * size.
   c              3   8   #    U  H  n[        U5      v   M     g 7fr&   r'   )r)   bs     r   r+   #_batch_by_length.<locals>.<genexpr>   s     'w!s1vvwr-   )	enumeratesortr?   r(   sumr3   sortedreverse)	r#   rG   r   r;   r*   lengths_indicesbatchesrB   lengths	            r   r5   r5      s     ;DD/J/
3+/OJGE$	LLOs5zA~&)3LLONN5!CE % u'w''3t9444078utF5M"G8OON% K  9s   DDr&   )r0   	functoolsr   typingr   r   r   r   r   r	   r
   r   r   utilr   r   r/   Sizingr   BatcherTboolr   floatr   r!   r(   r   r   r5    r   r   <module>r]      s    
 
 
 '	x}c!	"Xe_%xU'<<= 48
  	
 5'3,/0 N 48
  	
 5'3,/0 8 BF	6
	6&x'=>	6	6 "$
5/$
$ $ 	$
 $ d5k$T I
5/I
I d5kIZ 58
3-$'	$s)_r   