
    hX                       S SK Jr  S SKrS SKrS SKJrJr  S SKrS SK	r	S SK
J
r
  \R                  " \5      r\(       a   S SKJr   S SKJr   S SKJr   S SKJr      S             SS	 jjr    S             SS
 jjr      S               SS jjr    S             SS jjrg! \ a     Nf = f! \ a     Nf = f! \ a     Nf = f! \ a     Nf = f)    )annotationsN)TYPE_CHECKINGAny)tqdm)ElasticsearchQdrantClient)SeismicIndex)
OpenSearchc                	    SSK Jn  SSKJn  U R
                  (       a  U R                  [        R                  :w  a  [        S5      eUGc=  Uc  [        S5      eUR
                  (       a  UR                  [        R                  :w  a  [        S5      eU" SSS	0UD6nS
[        [        R                  " 5       5       3n	UR                  U	 0 SUR                  UR                  SS9S90S9  UR                  5       n
U
R!                  5       R#                  5       R%                  5       nU
R'                  5       R#                  5       R%                  5       nUR)                  S5      nSn/ nSnUS   n[*        R,                  " U[*        R.                  " U5      SS9n[*        R,                  " U[*        R.                  " U5      SS9n[1        [3        U5      SS9 H  nUU   nUU   nUS   UU R5                  5       nUUU R5                  5       nSUR7                  UUS90nUR9                  U5        [;        U5      U:  d  UUS-
  :X  d  Ms  UR=                  U	U[3        UU[;        U5      -   5      S9  U[;        U5      -  n/ nM     X4nUu  p/ n[        R                  " 5       n[3        U R)                  S5      5       GH  nU R?                  5       S:X  a  U R                  5       R!                  5       S   R#                  5       R%                  5       R5                  5       nU R                  5       R'                  5       R#                  5       R%                  5       R5                  5       nOU R                  5       R!                  5       S   R#                  5       R%                  5       U:H  nU R                  5       R!                  5       S   U   R#                  5       R%                  5       R5                  5       nU R                  5       R'                  5       U   R#                  5       R%                  5       R5                  5       nURA                  U	UR7                  UUS9USS9RB                  n U  V!s/ s H  n!U!RD                  U!RF                  S.PM     n"n!UR9                  U"5        GM     [        R                  " 5       U-
  n#U(       a  UU#U4$ UU#4$ ! [         a    [	        S5      ef = fs  sn!f )a  
Performs semantic search using sparse embeddings with Qdrant.

Args:
    query_embeddings: PyTorch COO sparse tensor containing query embeddings
    corpus_embeddings: PyTorch COO sparse tensor containing corpus embeddings
        Only used if corpus_index is None
    corpus_index: Tuple of (QdrantClient, collection_name)
        If provided, uses this existing index for search
    top_k: Number of top results to retrieve
    output_index: Whether to return the Qdrant client and collection name

Returns:
    A tuple containing:
    - List of search results in format [[{"corpus_id": int, "score": float}, ...], ...]
    - Time taken for search
    - (Optional) Tuple of (QdrantClient, collection_name) if output_index is True
r   r   )modelszWPlease install the Qdrant client with `pip install qdrant-client` to use this function.z,Query embeddings must be a sparse COO tensorz9Either corpus_embeddings or corpus_index must be providedz-Corpus embeddings must be a sparse COO tensorurlzhttp://localhost:6333sparse_collection_textF)on_diskindex)collection_namevectors_configsparse_vectors_configi'  left)siderightz#Processing and Upserting embeddingsdesc   )indicesvalues)r   vectorsids)r   querylimitusing	corpus_idscore )$qdrant_clientr	   qdrant_client.httpr   ImportError	is_sparselayouttorch
sparse_coo
ValueErrorinttimecreate_collectionSparseVectorParamsSparseIndexParamscoalescer   cpunumpyr   sizenpsearchsortedaranger   rangetolistSparseVectorappendlenupload_collection
sparse_dimquery_pointspointsidr&   )$query_embeddingscorpus_embeddingscorpus_indextop_koutput_indexkwargsr	   r   clientr   corpusindices_arr
values_arrnum_vectors
batch_sizevectors_batch
insert_idxrow_idsstartsendsistartendvec_indices
vec_valuesvector_dataall_resultssearch_start_timeq_idx	q_indicesq_valuesmasksearch_resultshitformatted_resultssearch_times$                                       m/home/james-whalen/.local/lib/python3.13/site-packages/sentence_transformers/sparse_encoder/search_engines.pysemantic_search_qdrantrh       s   :u.-
 %%)9)@)@EDTDT)TGHH$XYY **.?.F.F%JZJZ.ZLMM D"9DVD.s499;/?.@A  ./#)6+D+D6KcKclqKcKr+D+s"t 	! 	
 #++-nn&**,224]]_((*002
',,Q/

 a."))K*@vNw		+(>WMeK(/TUA1IEq'C%a.s3::<K#E#.557J!6#6#6{S]#6#^_K  -=!Z/1a3G(($3)j*s=7I*IJ ) 
 c-00
 "! V$ 0 +F K		 ',,Q/0&&(A-(113;;=a@DDFLLNUUWI'00299;??AGGIPPRH#,,.668;??AGGIURD(113;;=a@FJJLRRT[[]I'00299;DAEEGMMOVVXH  ,,+%%i%I	 - 

 & 	 SaaR`3366CIIFR`a,-) 1, ))+ 11KK55K''{  usttuj bs   S 1#S-S*c           
         SSK JnJn  [	        U [
        5      (       a  [        S U  5       5      (       d  [        S5      eUGc  Uc  [        S5      e[	        U[
        5      (       a  [        S U 5       5      (       d  [        S5      eU" S$0 UD6nS	[        [        R                  " 5       5       3n	UR                  R                  U	S
9(       a  UR                  R                  U	S
9  UR                  R                  U	SSSS0SS0S.00S9  [        U5      n
Sn[        [!        SX5      SS9 H  n[#        X-   U
5      n/ n[!        X5       Hy  n[%        X   5      nUR'                  5        VVs0 s H"  u  nn[)        U5      R+                  SS5      U_M$     nnnUR-                  U	[)        U5      [)        U5      US.S.5        M{     UR/                  X5        M     UR                  R1                  U	S
9  X4nUu  p/ n[        R                  " 5       n[!        [        U 5      5       H  n[%        U U   5      nUR'                  5        VVs0 s H"  u  nn[)        U5      R+                  SS5      U_M$     nnn/ nUR'                  5        H   u  nnUR-                  SSU 30 US.05        M"     USUSS.0S.nUR3                  U	US9nUS    S     Vs/ s H  n[        US!   5      US"   S#.PM     nnUR-                  U5        M     [        R                  " 5       U-
  nU(       a  UUU4$ UU4$ ! [         a    [        S5      ef = fs  snnf s  snnf s  snf )%a  
Performs semantic search using sparse embeddings with Elasticsearch.

Args:
    query_embeddings_decoded: List of query embeddings in format [[("token": value), ...], ...]
        Example: To get this format from a SparseEncoder model::

            model = SparseEncoder('my-sparse-model')
            query_texts = ["your query text"]
            query_embeddings = model.encode(query_texts)
            query_embeddings_decoded = model.decode(query_embeddings)
    corpus_embeddings_decoded: List of corpus embeddings in format [[("token": value), ...], ...]
        Only used if corpus_index is None
        Can be obtained using the same decode method as query embeddings
    corpus_index: Tuple of (Elasticsearch, collection_name)
        If provided, uses this existing index for search
    top_k: Number of top results to retrieve
    output_index: Whether to return the Elasticsearch client and collection name

Returns:
    A tuple containing:
    - List of search results in format [[{"corpus_id": int, "score": float}, ...], ...]
    - Time taken for search
    - (Optional) Tuple of (Elasticsearch, collection_name) if output_index is True
r   )r   helpersz^Please install the Elasticsearch client with `pip install elasticsearch` to use this function.c              3  t   #    U  H.  n[        U[        5      =(       a    [        S  U 5       5      v   M0     g7f)c              3  l   #    U  H*  n[        U[        5      =(       a    [        U5      S :H  v   M,     g7f   N
isinstancetupler@   .0ts     rg   	<genexpr>:semantic_search_elasticsearch.<locals>.<genexpr>.<genexpr>   *     &ZUYPQz!U';'KA!'KUY   24Nrp   listallrs   items     rg   ru   0semantic_search_elasticsearch.<locals>.<genexpr>   3      A,D 	4Z3&ZUY&Z#ZZ,   68UQuery embeddings must be a list of lists in the format [[('token', value), ...], ...]AEither corpus_embeddings_decoded or corpus_index must be providedc              3  t   #    U  H.  n[        U[        5      =(       a    [        S  U 5       5      v   M0     g7f)c              3  l   #    U  H*  n[        U[        5      =(       a    [        U5      S :H  v   M,     g7frm   ro   rr   s     rg   ru   rv      *     *^Y]TU:a+?+OCFaK+OY]rx   Nry   r|   s     rg   ru   r~      3      F
1 tT"^s*^Y]*^'^^1r   VCorpus embeddings must be a list of lists in the format [[('token', value), ...], ...]sparse_index_r   mappings
propertiestyperank_featureskeywordtokensrE   r   body  Upserting embeddingsr   ._rE   r   _index_id_sourcerank_featureztokens.)field
saturationboostboolr   )shouldminimum_should_matchr8   r!   hitsr   _scorer$   zhttp://localhost:9200)elasticsearchr   rj   r*   rp   rz   r{   r/   r0   r1   r   existsdeletecreater@   r   r<   mindictitemsstrreplacer?   bulkrefreshsearch)query_embeddings_decodedcorpus_embeddings_decodedrH   rI   rJ   rK   r   rj   es
index_namenum_docsrQ   	start_idxend_idxactionsrW   r   kvr]   r^   r_   query_tokensshould_clausestokenweightr!   resultrd   	formattedrf   s                                  rg   semantic_search_elasticsearchr      s   H
8 .55S A,A > > pqq$,`aa3T::# F
1F
 C
 C
 uvv=f=$S%5$67
:::.JJJ/


 "O#  &y1	#	 	 	
 01
eAx<CYZI)0(;GG9.7:;<BLLN<JDAqCFNN3,a/N   ","1v"%a&&,$	 /$ LL%- [0 	

,'!NBK		s3454U;<4@4F4F4H
4HDAqCFNN3$a'4H 	 

 )//1ME6!!>wug=N^`kq3r"st 2&^ef2g)hi%8 X^^dWeflWmnWmPS3s5z?S]KWm	n9% 6" ))+ 11KK55K''E  
l
 	

f6
 os   L2 1)M
)MM2Mc                     SSK JnJnJn	  Uc  0 nUc  0 nU	" 5       n
[        U [        5      (       a  [        S U  5       5      (       d  [        S5      eUGc  Uc  [        S5      e[        U[        5      (       a  [        S U 5       5      (       d  [        S5      eU" 5       n[        U5      n[        [        U5      S	S
9 H  n[        X   5      nUR                  [        U5      [        R                   " [        UR#                  5       5      U
S9[        R                   " [        UR%                  5       5      [        R&                  S95        M     UR(                  " U40 UD6n[*        R*                  " 5       n[        U 5      n/ n/ n[        U5       H  n[        U U   5      nUR-                  [        R                   " [        UR#                  5       5      U
S95        UR-                  [        R                   " [        UR%                  5       5      [        R&                  S95        M     SU;  a  SUS'   SU;  a  SUS'   UR.                  " S[        R                   " [        U5      U
S9UUUS.UD6n[1        US S9nU VVVVs/ s H*  nU VVVs/ s H  u  nnn[3        U5      US.PM     snnnPM,     nnnnn[*        R*                  " 5       U-
  nU(       a  UUU4$ UU4$ ! [         a    [	        S5      ef = fs  snnnf s  snnnnf )a8  
Performs semantic search using sparse embeddings with Seismic.

Args:
    query_embeddings_decoded: List of query embeddings in format [[("token": value), ...], ...]
        Example: To get this format from a SparseEncoder model::

            model = SparseEncoder('my-sparse-model')
            query_texts = ["your query text"]
            query_embeddings = model.encode(query_texts)
            query_embeddings_decoded = model.decode(query_embeddings)
    corpus_embeddings_decoded: List of corpus embeddings in format [[("token": value), ...], ...]
        Only used if corpus_index is None
        Can be obtained using the same decode method as query embeddings
    corpus_index: Tuple of (SeismicIndex, collection_name)
        If provided, uses this existing index for search
    top_k: Number of top results to retrieve
    output_index: Whether to return the SeismicIndex client and collection name
    index_kwargs: Additional arguments for SeismicIndex passed to build_from_dataset,
        such as centroid_fraction, min_cluster_size, summary_energy, nknn, knn_path,
        batched_indexing, or num_threads.
    search_kwargs: Additional arguments for SeismicIndex passed to batch_search,
        such as query_cut, heap_factor, n_knn, sorted, or num_threads.
        Note: query_cut and heap_factor are set to default values if not provided.
Returns:
    A tuple containing:
    - List of search results in format [[{"corpus_id": int, "score": float}, ...], ...]
    - Time taken for search
    - (Optional) Tuple of (SeismicIndex, collection_name) if output_index is True
r   )SeismicDatasetr
   get_seismic_stringzMPlease install Seismic with `pip install pyseismic-lsr` to use this function.c              3  t   #    U  H.  n[        U[        5      =(       a    [        S  U 5       5      v   M0     g7f)c              3  l   #    U  H*  n[        U[        5      =(       a    [        U5      S :H  v   M,     g7frm   ro   rr   s     rg   ru   4semantic_search_seismic.<locals>.<genexpr>.<genexpr>c  rw   rx   Nry   r|   s     rg   ru   *semantic_search_seismic.<locals>.<genexpr>b  r   r   r   r   c              3  t   #    U  H.  n[        U[        5      =(       a    [        S  U 5       5      v   M0     g7f)c              3  l   #    U  H*  n[        U[        5      =(       a    [        U5      S :H  v   M,     g7frm   ro   rr   s     rg   ru   r   m  r   rx   Nry   r|   s     rg   ru   r   l  r   r   r   zAdding documents to Seismicr   )dtype	query_cut
   heap_factorgffffff?)queries_idsquery_componentsquery_valuesr   c                $    [        U S   S   5      $ )Nr   )r0   )xs    rg   <lambda>)semantic_search_seismic.<locals>.<lambda>  s    C!QL    )keyr$   r'   )seismicr   r
   r   r*   rp   rz   r{   r/   r@   r   r<   r   add_documentr   r9   arraykeysr   float32build_from_datasetr1   r?   batch_searchsortedr0   )r   r   rH   rI   rJ   index_kwargssearch_kwargsr   r
   r   string_typedatasetrP   idxr   r^   num_queriesr   r   r_   r   resultsquery_result	query_idxr&   r%   r]   rf   s                               rg   semantic_search_seismicr   +  s   TkLL $&K .55S A,A > > pqq$,`aa3T::# F
1F
 C
 C
 uvv !"34 k*1NOC389F  Cfkkm,K@fmmo.bjjA P $66wO,O		./KL {#4U;<l.?.?.A)B+ VWBHHT,*=*=*?%@

ST $
 -'%'k"M)'*m$'' HHU;/{C)!
	
 G W"89G
 $#L [ggZf;V9eYs9~	6Zfg#  
 ))+ 11KK55K''e  kijjkR 	hs#   
K( 
L
L2	L
(K>L
c           	     :    SSK JnJn  [	        U [
        5      (       a  [        S U  5       5      (       d  [        S5      eUGcx  Uc  [        S5      e[	        U[
        5      (       a  [        S U 5       5      (       d  [        S5      eU" S0 UD6nS	[        [        R                  " 5       5       3n	UR                  R                  U	S
9(       a  UR                  R                  U	S
9  UR                  R                  U	SSSS0SS0S.00S9  [        U5      n
Sn[        [!        SX5      SS9 Hm  n[#        X-   U
5      n/ n[!        X5       H:  n[%        X   5      nUR'                  U	[)        U5      [)        U5      US.S.5        M<     UR+                  X5        Mo     UR                  R-                  U	S
9  X4nUu  p/ n[        R                  " 5       n[!        [        U 5      5       Hg  n[%        U U   5      nUSSSU000S.nUR/                  U	US9nUS   S    Vs/ s H  n[        US   5      US   S.PM     nnUR'                  U5        Mi     [        R                  " 5       U-
  nU(       a  UUU4$ UU4$ ! [         a    [        S5      ef = fs  snf ) a  
Performs semantic search using sparse embeddings with OpenSearch.

Args:
    query_embeddings_decoded: List of query embeddings in format [[("token": value), ...], ...]
        Example: To get this format from a SparseEncoder model::

            model = SparseEncoder('my-sparse-model')
            query_texts = ["your query text"]
            query_embeddings = model.encode(query_texts)
            query_embeddings_decoded = model.decode(query_embeddings)
    corpus_embeddings_decoded: List of corpus embeddings in format [[("token": value), ...], ...]
        Only used if corpus_index is None
        Can be obtained using the same decode method as query embeddings
    corpus_index: Tuple of (OpenSearch, collection_name)
        If provided, uses this existing index for search
    top_k: Number of top results to retrieve
    output_index: Whether to return the OpenSearch client and collection name
    vocab: The dict to transform tokens into token ids

Returns:
    A tuple containing:
    - List of search results in format [[{"corpus_id": int, "score": float}, ...], ...]
    - Time taken for search
    - (Optional) Tuple of (OpenSearch, collection_name) if output_index is True
r   )r   rj   z[Please install the OpenSearch client with `pip install opensearch-py` to use this function.c              3  t   #    U  H.  n[        U[        5      =(       a    [        S  U 5       5      v   M0     g7f)c              3  l   #    U  H*  n[        U[        5      =(       a    [        U5      S :H  v   M,     g7frm   ro   rr   s     rg   ru   7semantic_search_opensearch.<locals>.<genexpr>.<genexpr>  rw   rx   Nry   r|   s     rg   ru   -semantic_search_opensearch.<locals>.<genexpr>  r   r   r   r   c              3  t   #    U  H.  n[        U[        5      =(       a    [        S  U 5       5      v   M0     g7f)c              3  l   #    U  H*  n[        U[        5      =(       a    [        U5      S :H  v   M,     g7frm   ro   rr   s     rg   ru   r     r   rx   Nry   r|   s     rg   ru   r     r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   neural_sparser   r   r   r   r   r   r$   r   )opensearchpyr   rj   r*   rp   rz   r{   r/   r0   r1   r   r   r   r   r@   r   r<   r   r   r?   r   r   r   r   )r   r   rH   rI   rJ   rK   r   rj   	os_clientr   r   rQ   r   r   r   rW   r   r]   r^   r_   r   r!   r   rd   r   rf   s                             rg   semantic_search_opensearchr     s   J
4 .55S A,A > > pqq$,`aa3T::# F
1F
 C
 C
 uvvA&A	$S%5$67
##*#5$$:$6   "O#  &y1	#	 	! 	
 01
eAx<CYZI)0(;GG9.7:;","1v"%a&&,$	 / LL,% [( 	!!
!3!.(IK		s3454U;</H~WcFd;e)fg!!
!? X^^dWeflWmnWmPS3s5z?S]KWm	n9% 6 ))+ 11KK55K''q  
i
 	

` os   I? "J?J)NNr   F)rF   ztorch.TensorrG   ztorch.Tensor | NonerH   ztuple[QdrantClient, str] | NonerI   r0   rJ   r   rK   r   returnz}tuple[list[list[dict[str, int | float]]], float] | tuple[list[list[dict[str, int | float]]], float, tuple[QdrantClient, str]])r   list[list[tuple[str, float]]]r   $list[list[tuple[str, float]]] | NonerH   z tuple[Elasticsearch, str] | NonerI   r0   rJ   r   rK   r   r   z~tuple[list[list[dict[str, int | float]]], float] | tuple[list[list[dict[str, int | float]]], float, tuple[Elasticsearch, str]])NNr   FNN)r   r   r   r   rH   ztuple[SeismicIndex, str] | NonerI   r0   rJ   r   r   dict[str, Any] | Noner   r   r   z}tuple[list[list[dict[str, int | float]]], float] | tuple[list[list[dict[str, int | float]]], float, tuple[SeismicIndex, str]])r   r   r   r   rH   ztuple[OpenSearch, str] | NonerI   r0   rJ   r   rK   r   r   z{tuple[list[list[dict[str, int | float]]], float] | tuple[list[list[dict[str, int | float]]], float, tuple[OpenSearch, str]])
__future__r   loggingr1   typingr   r   r7   r9   r-   r   	getLogger__name__loggerr   r   r*   r(   r	   r   r
   r   r   rh   r   r   r   r'   r   rg   <module>r      s-   "   %   			8	$/.(+ .248}("}(*}( 2}( 	}(
 }( }(Q}(D GK59H(;H(CH( 3H( 	H(
 H( H(RH(Z GK48*.+/~(;~(C~( 2~( 	~(
 ~( (~( )~(Q~(F GK26(;(C( 0( 	(
 ( (O(y        sG   B? C C C# ?CCCCC C #C,+C,