
    >'hH                        d dl Z d dlmZ d dlmZ d dlmZ d dlmZm	Z	m
Z
mZmZmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZmZ d dlmZ d dlmZ d dlm Z m!Z!m"Z"m#Z#m$Z$ d dl%m&Z&m'Z' d dl(m)Z)  G d de'      Z* G d d      Z+y)    N)defaultdict)deepcopy)get_all_start_methods)OptionalUnionIterableAnyTypeget_args)	BaseModel)models)INFERENCE_OBJECT_TYPES)InspectorEmbed)Embedder)NumericVectorNumericVectorStruct)ModelSchemaParser)	FieldPath)SUPPORTED_EMBEDDING_MODELS!SUPPORTED_SPARSE_EMBEDDING_MODELS"_LATE_INTERACTION_EMBEDDING_MODELS_IMAGE_EMBEDDING_MODELS-_LATE_INTERACTION_MULTIMODAL_EMBEDDING_MODELS)ParallelWorkerPoolWorker)
iter_batchc                   j    e Zd ZdedefdZedededd fd       Zdee	eef      dee	eef      fdZ
y)	ModelEmbedderWorker
batch_sizekwargsc                 2    t        di || _        || _        y )N )ModelEmbeddermodel_embedderr   )selfr   r    s      i/home/kushmeetdev/Regenta/Chatbot/venv/lib/python3.12/site-packages/qdrant_client/embed/model_embedder.py__init__zModelEmbedderWorker.__init__   s    +5f5$    returnc                      | dd|d|S )N   )threadsr   r"   r"   )clsr   r    s      r&   startzModelEmbedderWorker.start    s    >1>v>>r(   itemsc              #      K   |D ]9  \  }}|t        | j                  j                  || j                              f ; y w)Ninference_batch_size)listr$   embed_models_batchr   )r%   r/   idxbatchs       r&   processzModelEmbedderWorker.process$   sP      	JC''::DOO ;  	s   A AN)__name__
__module____qualname__intr	   r'   classmethodr.   r   tupler7   r"   r(   r&   r   r      sl    %3 %# % ?s ?c ?6K ? ?	XeCHo6 	8E#s(O;T 	r(   r   c                   <   e Zd ZdZddee   defdZ	 	 ddee	e
e	   f   deded	e
e	   fd
Z	 	 dde
eeee	f   e	f      dedee   d	e
eeee	f   e	f      fdZ	 	 ddeeeee	f   e	f      deded	e
e	   fdZ	 	 	 	 ddeeee	f   e	f   deee      dededee   d	eeee	f   eeef   e	ef   fdZdej.                  d	dfdZ	 ddej.                  deded	efdZddeded	dfdZded	efdZedej.                  d	ej.                  fd       Zed	e e!   fd       Z"y) r#   @   Nparserr    c                 b    i | _         i | _        t        |      | _        t	        di || _        y )N)r@   r"   )_batch_accumulator_embed_storager   _embed_inspectorr   embedder)r%   r@   r    s      r&   r'   zModelEmbedder.__init__3   s.    KM>@ .f = *6*r(   
raw_modelsis_queryr   r)   c              #      K   t        |t              r|g}t        ||      D ]  }| j                  |||      E d{      y7 w)a2  Embed raw data fields in models and return models with vectors

            If any of model fields required inference, a deepcopy of a model with computed embeddings is returned,
            otherwise returns original models.
        Args:
            raw_models: Iterable[BaseModel] - models which can contain fields with raw data
            is_query: bool - flag to determine which embed method to use. Defaults to False.
            batch_size: int - batch size for inference
        Returns:
            list[BaseModel]: models with embedded fields
        r1   N)
isinstancer   r   r4   )r%   rF   rG   r   raw_models_batchs        r&   embed_modelszModelEmbedder.embed_models9   sY     " j),$J *:z B 	.. ( /   	s   :AAAparallelc              #     K   d}t        |t              rt        |      |k  rd}||dk(  s|r-t        ||      D ]  }| j	                  ||      E d{     yd}t        ||      }|dk(  rt        j                         }dt               v rdnd	}|J t        || j                         || j                  
      }	|	j                  ||      D ]  }|E d{     y7 7 	w)a1  Embed raw data fields in models and return models with vectors

        Requires every input sequences element to contain raw data fields to inference.
        Does not accept ready vectors.

        Args:
            raw_models: Iterable[BaseModel] - models which contain fields with raw data to inference
            batch_size: int - batch size for inference
            parallel: int - number of parallel processes to use. Defaults to None.

        Returns:
            Iterable[Union[dict[str, BaseModel], BaseModel]]: models with embedded fields
        FTNr+   r1   )sizer   
forkserverspawn)num_workersworkerstart_methodmax_internal_batch_size)r   )rI   r3   lenr   r4   os	cpu_countr   r   _get_worker_classMAX_INTERNAL_BATCH_SIZEordered_map)
r%   rF   r   rL   is_smallr6   multiprocessing_batch_sizeraw_models_batchesrS   pools
             r&   embed_models_strictz!ModelEmbedder.embed_models_strictQ   s    & j$':+x1}#J
; [225z2ZZZ[ *+&!+J=W!X1}<<>+7;P;R+R<X_L'''%$--/)(,(D(D	D ))"/I *  ! !  !# [( !s%   AC%C!BC%C#C%#C%r2   c              #       K   |D ]  } j                  |d         j                  s|E d{    y fd|D        E d{    y7 7 w)af  Embed a batch of models with raw data fields and return models with vectors

            If any of model fields required inference, a deepcopy of a model with computed embeddings is returned,
            otherwise returns original models.
        Args:
            raw_models: list[Union[dict[str, BaseModel], BaseModel]] - models which can contain fields with raw data
            is_query: bool - flag to determine which embed method to use. Defaults to False.
            inference_batch_size: int - batch size for inference
        Returns:
            Iterable[BaseModel]: models with embedded fields
        T)rG   accumulatingNc              3   H   K   | ]  }j                  |d         yw)FrG   ra   r2   N)_process_model).0	raw_modelr2   rG   r%   s     r&   	<genexpr>z3ModelEmbedder.embed_models_batch.<locals>.<genexpr>   s7        ##%!&)=	 $ s   ")rd   rB   )r%   rF   rG   r2   rf   s   ` `` r&   r4   z ModelEmbedder.embed_models_batch   sg     " $ 	QI	H4P	Q &&!!! ",   "s!   -AAAA	AAmodelpathsra   c           
         t        |t        t                    r1|r| j                  |       n|J d       | j	                  |||      S ||st        |      n|}t        |t              rJ|j                         D ]5  \  }}|r| j                  ||d       | j                  |||d|      ||<   7 |S ||n| j                  j                  |      }|D ]  }t        |t              s|gn|}	|	D ]  }
t        |
|j                  d      }||j                  r!| j                  ||j                  |||       Jt        |t              }|r|n|g}|s_|J d       |D cg c]  }| j	                  |||       }}|rt        |
|j                  |       t        |
|j                  |d          |D ]  }| j                  |          |S c c}w )	ab  Embed model's fields requiring inference

        Args:
            model: Qdrant http model containing fields to embed
            paths: Path to fields to embed. E.g. [FieldPath(current="recommend", tail=[FieldPath(current="negative", tail=None)])]
            is_query: Flag to determine which embed method to use. Defaults to False.
            accumulating: Flag to determine if we are accumulating models for batch embedding. Defaults to False.
            inference_batch_size: Optional[int] - batch size for inference

        Returns:
            A deepcopy of the method with embedded fields
        Nz3inference_batch_size should be passed for inferencerG   r2   T)ra   Frc   r   )rI   r   r   _accumulate_drain_accumulatorr   dictr/   rd   rD   inspectr3   getattrcurrenttailsetattr)r%   rh   ri   rG   ra   r2   keyvaluepath
list_modelitemcurrent_modelwas_listdata
embeddingss                  r&   rd   zModelEmbedder._process_model   s3   * eX&<=>  ' )4IHI4..%)= /   =+7HUOUEeT"#kkm 

U''u4'H!%!4!4!)%*-A "5 "E#J	
 L*0E0E0M0Me0T "	3D(25$(?%UJ"  3 'dllD A (99''%		!)%1-A (   *->H5=MM?M'0<QPQ< )6	& !% !33 $xNb 4 &
 & $#D$,,
C#D$,,
1F$1 3D ,,T23? 3"	3F &s   !Gr{   c                     t        |t              r'|j                         D ]  }| j                  |        yt        |t              r3|D ].  }t        |t        t                    s y| j                  |       0 t        |t        t                    sy| j                  |      }|j                  | j                  vrg | j                  |j                  <   | j                  |j                     j                  |       y)aH  Add data to batch accumulator

        Args:
            data: models.VectorStruct - any vector struct data, if inference object types instances in `data` - add them
                to the accumulator, otherwise - do nothing. `InferenceObject` instances are converted to proper types.

        Returns:
            None
        N)rI   rn   valuesrl   r3   r   r   _resolve_inference_objectrh   rB   append)r%   r{   ru   s      r&   rl   zModelEmbedder._accumulate   s     dD! (  '(dD! (!%2H)IJ  '(
 $)? @A--d3::T44424D##DJJ/

+2248r(   c                 ,   t        |t              r1|j                         D ]  \  }}| j                  |||      ||<    |S t        |t              rIt        |      D ]9  \  }}t        |t        t                    s|c S | j                  |||      ||<   ; |S t        |t        t                    s|S | j                  r&| j                  j                  |j                  d      s| j                  ||       | j                  |j                        S )at  Drain accumulator and replaces inference objects with computed embeddings
            It is assumed objects are traversed in the same order as they were added to the accumulator

        Args:
            data: models.VectorStruct - any vector struct data, if inference object types instances in `data` - replace
                them with computed embeddings. If embeddings haven't yet been computed - compute them and then replace
                inference objects.
            inference_batch_size: int - batch size for inference

        Returns:
            NumericVectorStruct: data with replaced inference objects
        rk   N)rI   rn   r/   rm   r3   	enumerater   r   rC   getrh   _embed_accumulator_next_embed)r%   r{   rG   r2   rt   ru   is          r&   rm   z ModelEmbedder._drain_accumulator  s    dD!"jjl 
U 33HCW 4 S	 KdD!%dO 5!%2H)IJK11HCW 2 Q	 K(12
 K""$*=*=*A*A$**d*S##XL`#a

++r(   c           	          dt         t           dt        dt        dt         t           f fd} j
                  D ]3  }|g t        t        t        t        t        vs't        | d        j
                  j                         D ]  \  }} ||||       j                  |<     j
                  j                          y)	a  Embed all accumulated objects for all models

        Args:
            is_query: bool - flag to determine which embed method to use. Defaults to False.
            inference_batch_size: int - batch size for inference
        Returns:
            None
        objects
model_namer   r)   c                    g }g }g }t        t              }t        |       D ]  \  }}t        |t        j
                        }	t        t        ||            D ]_  \  }
\  }}||j                  k(  s|	|k(  s||
   j                  |       ||
   j                  |	r|j                  n|j                           |g|t        |      <   |j                  |j                         |j                  |	       |j                  |	r|j                  n|j                  g        g }t        t        ||            D ]X  \  }\  }}	|j                  j                  j                  ||	r||   nd|	s||   nd|xs i |      D cg c]  }| c}       Z t        |      }g gt        |       z  }|j!                         D ]  }|D ]  }t#        |      ||<     |S c c}w )z
            Assemble batches by options and data type based groups, embeds and return embeddings in the original order
            N)r   textsimagesrG   optionsr   )r   r3   r   rI   r   Documentzipr   r   textimagerU   extendrE   embediterr~   next)r   r   r   unique_optionsunique_options_is_textbatchesgroup_indicesr   objis_textjr   options_is_textr|   	embeddingiter_embeddingsordered_embeddingsindicesindexrG   r%   s                      r&   r   z/ModelEmbedder._embed_accumulator.<locals>.embedN  s    46N13"!#G2=d2CM#G, I3$S&//:5>(>?6 I1A1 #++-'_2L%a(//2
))g#((399MI ;<M#n"56"))#++6*11':NNCHHSYY#GHI  J)23~G]3^)_ %%GW!! *.)<)<'107'!*T5<71:$%-$+Mr'1 *= *
% "
 #:.O=?D3w<<O(//1 F$ FE04_0E&u-FF &%%
s    	G
 is not among supported models)r   r   r   N)r3   r   strr;   r   rB   r   r   r   r   r   
ValueErrorr/   rC   clear)r%   rG   r2   r   rh   r{   s   ``    r&   r   z ModelEmbedder._embed_accumulatorD  s    0	&010	&?B0	&PS0	&- 0	&d ,, 	KE +2 4 )	
 ?  !E7*H!IJJ	K  2288: 	KE4).;O*D&	 	%%'r(   r   c                 >    | j                   |   j                  d      S )zGet next computed embedding from embedded batch

        Args:
            model_name: str - retrieve embedding from the storage by this model name

        Returns:
            NumericVector: computed embedding
        r   )rC   pop)r%   r   s     r&   r   zModelEmbedder._next_embed  s      "":.22155r(   c                 h   t        | t        j                        s| S | j                  }| j                  }| j
                  }|g t        t        t        v rt        j                  |||      S |t        v rt        j                  |||      S |t        v rt        | d      t        | d      )a  Resolve inference object into a model

        Args:
            data: models.VectorStruct - data to resolve, if it's an inference object, convert it to a proper type,
                otherwise - keep unchanged

        Returns:
            models.VectorStruct: resolved data
        )rh   r   r   )rh   r   r   z- does not support `InferenceObject` interfacer   )rI   r   InferenceObjectrh   objectr   r   r   r   r   r   Imager   r   )r{   r   ru   r   s       r&   r   z'ModelEmbedder._resolve_inference_object  s     $ 6 67KZZ
,, 
'
.
 0
 

 ??%QQ00<<jwOOFF
|+XYZZJ<'EFGGr(   c                     t         S N)r   )r-   s    r&   rX   zModelEmbedder._get_worker_class  s    ""r(   r   )F   )r   N)NFFN)r   )#r8   r9   r:   rY   r   r   r	   r'   r   r   r   boolr;   rK   rn   r   r_   r3   r4   r   r   rd   r   VectorStructrl   r   rm   r   r   staticmethodr   r<   r
   r   rX   r"   r(   r&   r#   r#   0   s    +x(9: +S + 	)Xi%889  	
 
)	6 "&	/!U4Y#7#BCD/! /! 3-	/!
 
%S)^,i78	9/!h $%	tCN3Y>?@  "	
 
)	H ,0".2XT#y.)945X Y(X 	X
 X 'smX 
tCN#T#}*<%=y-W	XXt9 3 3 9 9< VW(,''(,37(,OR(,	(,TJ(4 J(s J([_ J(X	6c 	6m 	6 H(;(; H@S@S H H< #$':"; # #r(   r#   ),rV   collectionsr   copyr   multiprocessingr   typingr   r   r   r	   r
   r   pydanticr   qdrant_client.httpr   qdrant_client.embed.commonr   #qdrant_client.embed.embed_inspectorr   qdrant_client.embed.embedderr   qdrant_client.embed.modelsr   r   !qdrant_client.embed.schema_parserr   qdrant_client.embed.utilsr   qdrant_client.fastembed_commonr   r   r   r   r    qdrant_client.parallel_processorr   r   qdrant_client.uploader.uploaderr   r   r#   r"   r(   r&   <module>r      sZ    	 #  1 A A  % = > 1 I ? /  H 6& *L# L#r(   