
    hM                     L   S SK r S SKrS SKJr  S SKJr  S SKJrJrJ	r	J
r
JrJrJrJrJrJrJr  S SKrS SKrS SKrS SKJrJrJr  SSKJr  SSKJr  SS	KJr  SS
KJ r   SSK!J"r"J#r#J$r$  SSK%J&r&  SSK'J(r(  SSK)J*r*  SSK+J,r,  SSK-J.r.J/r/  SSK0J1r1  SSKJ2r2J3r3  SSK4J5r6  SSK7J8r8J9r9J:r:J;r;J<r<J=r=J>r>J?r?  Sr@SrASrBSrCSrDSrESrF\<R                  " SSSS.S9\;R                  " SSSS.SS 9\8" S!S"SSS#9\9" SS$S%S&S'S(9\9" S)S*S+S,S(9\9" S)S-S.S/S(9\9" S)S0S1S2S(94S3\R                  S4\S5\\   S6\IS7\IS8\I4S9 jj5       5       rJ0 S)S)SSS:.S4\S;\\K\4   S6\IS7\IS8\IS<\I4S= jjrLS>\S?\S@S4SA jrMSB\\.   SC\
\K   SD\SE\IS@\\K\4   4
SF jrN\SlSG\	\K   SH\S)   S@\K4SI jj5       rO\SG\	\\K\P4      SH\S   S@\K4SJ j5       rO SlSG\\	\K   \	\\K\P4      4   SH\IS@\K4SK jjrOSmSL\\P\Q4   SM\IS@\K4SN jjrR  SnS\\.   SO\KSP\SQ   SR\\K   S@\P4
SS jjrSSD\ST\KS@\\K   4SU jrTSD\S@\\K\\K   4   4SV jrUSW\
S@\Q4SX jrVSY\\K\Q4   SZ\S@\Q4S[ jrWSmS\\IS@\4S] jjrXS^\S_\S@\Q4S` jrYSa\
\   SG\
\K   S@\
\   4Sb jrZSB\
\.   Sc\\K\4   SR\KS@\\K\4   4Sd jr[Se\\K\4   4Sf jr\\F4Sg\S@\\P\Q4   4Sh jjr]Si\\P\Q4   Sj\PS@\\P\Q4   4Sk jr^g)o    N)Counter)Path)AnyDictIterableListOptionalSequenceSetTupleUnioncastoverload)MESSAGESPrintermsg   )util)Literal)Language)
Morphology)MorphologizerSpanCategorizerTrainablePipe)	EditTrees)nonproj)	DELIMITER)ConfigSchemaTraining)Exampleremove_bilu_prefix)get_sourced_components)registryresolve_dot_names)Mode   )ArgOpt_format_numberapp	debug_cliimport_codeparse_config_overridesshow_validation_error2      d   i  Z   dataT)allow_extra_argsignore_unknown_options)context_settings
debug-data)r5   hidden.zPath to config file)helpexists
allow_dashz--code-pathz--codez-czNPath to Python file with additional code (registered functions) to be imported)r8   Fz--ignore-warningsz-IWz+Ignore warnings, only show stats and errorsz	--verbosez-Vz-Print additional information and explanationsz--no-formatz-NFzDon't pretty-print the resultsctxconfig_path	code_pathignore_warningsverbose	no_formatc           	          U R                   R                  S:X  a  [        R                  " S5        [	        U R
                  5      n[        U5        [        UUUUUSS9  g)z
Analyze, debug and validate your training and development data. Outputs
useful stats, and can help you find problems like invalid entity annotations,
cyclic dependencies, low data labels and more.

DOCS: https://spacy.io/api/cli#debug-data
r6   zThe debug-data command is now available via the 'debug data' subcommand (without the hyphen). You can run python -m spacy debug --help for an overview of the other available debugging commands.Fconfig_overridesr>   r?   r@   silentN)commandnamer   warnr,   argsr+   
debug_data)r;   r<   r=   r>   r?   r@   	overridess          N/home/james-whalen/.local/lib/python3.13/site-packages/spacy/cli/debug_data.pydebug_data_clirL   ?   sZ    2 {{<'P	

 'sxx0I	"'    rB   rC   rD   c                0  ^N^O [        XT(       + US9n[        U 5         [        R                  " XS9n[        R                  " U5      mNTNR
                  R                  5       n[        R                  " US   [        S9n	S S S 5        [        W5      n
W	S   nU
 Vs/ s H  oU;  d  M
  UPM     nnTNR                  nTNR                   Vs/ s H  nTNR                  U5      R                  PM      nnUR                  " S5        U	S   U	S   /n[        WU5      u  mOnTNR!                  UNUO4S	 j5        UR"                  " S
5        [%        TO" TN5      5      n[%        U" TN5      5      nUR"                  " S5        ['        UUTNSS9n['        UUTNSS9n['        UUTNSS9nUS   nUS   nU	S   nUR                  " S5        UR(                  " STNR*                   35        UR(                  " SSR-                  U5       35        U(       a$  UR(                  " SSR-                  U5       35        U(       a$  UR(                  " SSR-                  U5       35        UR(                  " [/        U5       S35        UR(                  " [/        U5       S35        [/        U5      (       d  UR0                  " S5        [/        UR3                  U5      5      nU(       a  UR4                  " U S35        OUR"                  " S5        U(       dz  [/        U5      [6        :  ag  S[/        U5       S3n[/        U5      [8        :  a  UR0                  " U5        OUR4                  " U5        UR(                  " S[6         S[8         S3US9  UR                  " S 5        US!   nUR:                  " U S"[/        US#   5       S$35        US%   S&:  a  US%   nUR4                  " U S'35        US%   S&:  a  US%   nUR4                  " U S(35        US#   R=                  S)5      nUR(                  " S*[?        USS+9 3US9  [/        TNR@                  RB                  5      (       Ga  TNR@                  RB                  RD                  [F        RH                  :X  a  UR:                  " S,[/        TNR@                  RB                  5       S-TNR@                  RJ                   S.TNR@                  RB                  RL                   S/TNR@                  RB                  RN                   S03	5        OUR:                  " [/        TNR@                  RB                  5       S1TNR@                  RB                  RP                   S2TNR@                  RJ                   S335        [S        US4   RU                  5       5      nUR4                  " S5RW                  US6UUS!   -  -  5      5        UR(                  " S7RW                  [?        US4   R=                  S)5      SS+95      US9  OUR:                  " S85        S9U;   d  S:U;   Ga+  [Y        TN5      n Sn!Sn"UR                  " S;5        URZ                  " U S<S=/SS>9  UR(                  " S?US9  US9   R]                  5        H3  u  n#n$UR(                  " S@U# S[?        U$R]                  5       SS+9 3US9  M5     U R_                  5        V#s0 s H  n#U#US9   U#   _M     n%n#U%R]                  5        GH  u  n#n$U$R]                  5        H  u  n&n'U#U R_                  5       ;   n(U((       a"  U&U U#   ;  a  UR4                  " SAU& SBU# SC35        U'[`        ::  a  UR4                  " SDU& SEU# SFU' S35        Sn!URb                  " SG5         [e        UU&S9U#5      n)S S S 5        W)S&:X  d  M  UR4                  " SHU& SI35        Sn"M     URb                  " SJ5         [g        UUU#5      n*S S S 5        UR:                  " SKU# SI35        UR:                  " SL5        [i        W*5        [k        USM   U#   5      n+[m        U+[n        SN9n,UR:                  " SO[n         SP[q        U,R_                  5       5       SQU*SR    SSU*ST    SU[s        U,5       SV35        UR(                  " SW[s        U+5       3US9  U*SX   [t        :  a  UR4                  " SY5        OUR"                  " SZ5        U*S[   RU                  5       n-[S        U-[w        5       5      n.U.R=                  S)5       V/V0s/ s H  u  n/n0U/PM
     n1n/n0UR(                  " S\RW                  [?        U15      5      US9  U*S]   [x        :  a  UR4                  " S^5        OUR"                  " S_5        U*S`   RU                  5       n2[S        U2[w        5       5      n3U3R=                  S)5       V/V0s/ s H  u  n/n0U/PM
     n4n/n0UR(                  " SaRW                  [?        U45      5      US9  GM     U!(       a  UR(                  " Sb[`         Sc3US9  OUR"                  " Sd5        U"(       a  UR(                  " SeUS9  OUR"                  " Sf5        SgU;   Ga  [{        Sh USg    5       5      n5USg   n6[}        TNSg5      n7Sn!Sn"Sn8Sn9UR                  " Si5        UR:                  " [/        U75       Sj35        U6S/   n:UR(                  " U: Sk35        U5 H&  n&[/        U&5      S&:X  d  M  UR0                  " Sl5        M(     U6R=                  5        V&V's/ s H  u  n&n'U&S/:w  d  M  U&U'4PM     n;n&n'[?        U;SS+9n;UR(                  " SmU; 3US9  U7U5-
  n<U<(       a  UR4                  " Sn[?        U<5       So35        USp   (       a  UR0                  " USp    Sq35        Sn8U5 Hx  n&U6U&   [`        ::  d  M  UR4                  " SDU& SFU6U&    S35        Sn!URb                  " SG5         [e        UU&Sg5      n)S S S 5        W)S&:X  d  M`  UR4                  " SHU& SI35        Sn"Mz     USr   (       a  UR4                  " USr    Ss35        Sn9U!(       d  UR"                  " Sd5        U"(       d  UR"                  " Sf5        U8(       d  UR"                  " St5        U9(       d  UR"                  " Su5        U!(       a  UR(                  " Sv[`         Sc3US9  U"(       a  UR(                  " SwUS9  U8(       a  UR(                  " Sx5        SyU;   Ga[  UR                  " Sz5        [}        TNSy5      n5UR:                  " S{[/        U55       Sj35        UR(                  " S|[?        U55       3US9  U5[{        US}   5      -
  n<U<(       a  UR4                  " Sn[?        U<5       So35        [{        US}   5      [{        US}   5      :w  a1  UR4                  " S~[?        US}   5       S[?        US}   5       So35        [/        U55      S:  a  UR0                  " S5        US   S&:  d	  US   S&:  a  UR0                  " S5        US   S&:  a  UR0                  " S5        US   S&:  a  UR0                  " S5        SU;   GaV  UR                  " S5        [}        TNS5      n5UR:                  " S{[/        U55       Sj35        UR(                  " S|[?        U55       3US9  U5[{        US}   5      -
  n<U<(       a  UR4                  " Sn[?        U<5       So35        [{        US}   5      [{        US}   5      :w  a1  UR4                  " S~[?        US}   5       S[?        US}   5       So35        US   S&:  d	  US   S&:  a  UR0                  " S5        US   S&:  a  US   S&:X  a  UR4                  " S5        O-UR4                  " S5        US   S&:  a  UR0                  " S5        SU;   Ga?  UR                  " S5        [        US   R]                  5       6 u  n=n>UR:                  " [/        U=5       S35        [        R                  " U>5      nXRS                  5       -  nU* [        R                  " U5      -  RS                  5       [        R                  " [/        U=5      5      -  n?UR:                  " U? S35        [}        TNS5      n7[{        U=5      n5U7U5-
  n<U<(       a  UR4                  " Sn[?        U<5       So35        [?        US   R=                  5       SS+9n;UR(                  " U;US9  SU;   a  UR                  " S5        US    V&s/ s H  n&U&PM     n=n&[}        TNS5      n7UR:                  " [/        U=5       S35        [{        U=5      n5U7U5-
  n<U<(       a  UR4                  " Sn[?        U<5       So35        [?        US   R=                  5       SS+9n;UR(                  " U;US9  SU;   Gat  Sn!UR                  " S5        UR:                  " SUS    SUS!   US   -  S S35        US   [/        US   5      -  n@U@S:  a  UR4                  " SW@S S35        US    V&s/ s H  n&U&PM     nAn&US    V&s/ s H  n&U&PM     nBn&US    V&s/ s H  n&U&PM     nCn&US   S&:  a  US   nDUR:                  " SUD S35        US   S&:  a  US   nDUR:                  " SUD S35        UR:                  " [/        WB5       S35        UR:                  " [/        WA5       S35        [?        US   R=                  5       SS+9n;UR(                  " U;US9  US    H6  n&US   U&   [        ::  d  M  UR4                  " SDU& SFUS   U&    S35        Sn!M8     / nEUS    H=  n&US   U&   [        ::  d  M  [        U&;   d  M!  WER                  U& SUS   U&    35        M?     [/        WE5      S&:  aB  UR4                  " S[/        WE5       S35        UR4                  " SSR-                  UE5      US9  Sn![{        WA5      [{        WC5      -
  (       a6  UR4                  " SSR-                  [{        WA5      [{        WC5      -
  5      US9  [{        WC5      [{        WA5      -
  (       a6  UR4                  " SSR-                  [{        WC5      [{        WA5      -
  5      US9  U!(       a  UR(                  " S[         S3US9  [/        US   5      S:  a(  UR4                  " SSR-                  US   5       S35        US   S&:  a  UR0                  " SUS    S35        US   S&:  a  UR0                  " SUS    S35        SU;   Ga  UR                  " S5        US   nFUS   nGUR:                  " [/        UF5       S35        UR:                  " [/        UG5       S35        UGUF-
  nH[/        UH5      S&:w  a=  [/        WH5      [/        WG5      -  nIUR:                  " [/        UH5       SUIS6-  S S35        OUR:                  " S5        US   S&:  a  US   nJUR4                  " UJ S35        US   S&:  a  US   nJUR4                  " UJ S35        US   S&:  a  US   nJUR4                  " UJ S35        OUR"                  " S5        US   S&:  a  US   nJUR4                  " UJ S35        OUR"                  " S5        US   S&:  a  US   nJUR:                  " UJ S35        OUR"                  " S5        US   S&:  a  US   nJUR:                  " UJ S35        OUR"                  " S5        UR                  " S5        UR                  [        R                     nKUR                  [        R                     nLUR                  [        R                     nMUK(       a   UR"                  " WK SUKS:X  a  SOS S35        WL(       a  UR4                  " WL SULS:X  a  SOS 35        WM(       a6  UR0                  " WM SUMS:X  a  SOS 35        [        R                  " S5        g g ! , (       d  f       GN|= fs  snf s  snf s  sn#f ! , (       d  f       GN= f! , (       d  f       GN[= fs  sn0n/f s  sn0n/f s  sn'n&f ! , (       d  f       GNp= fs  sn&f s  sn&f s  sn&f s  sn&f )N)no_printprettyr>   )rJ   training)schemafrozen_componentszData file validationtrain_corpus
dev_corpusc                     > T" T 5      $ N )nlprT   s   rK   <lambda>debug_data.<locals>.<lambda>   s
    <,rM   z%Pipeline can be initialized with datazCorpus is loadableT)	make_projFtextszTraining statsz
Language: zTraining pipeline: , z!Components from other pipelines: zFrozen components: z training docsz evaluation docszNo evaluation docsz* training examples also in evaluation dataz/No overlap between training and evaluation dataz0Low number of examples to train a new pipeline ()z!It's recommended to use at least z examples (minimum )showzVocab & Vectorsn_wordsz total word(s) in the data (wordsz unique)n_misaligned_wordsr   z' misaligned tokens in the training dataz" misaligned tokens in the dev data
   z10 most common words: )countszfloret vectors with z
 vectors, z dimensions, -z char n-gram subwordsz
 vectors (z unique keys, z dimensions)words_missing_vectorsz3{} words in training data without vectors ({:.0f}%)r0   z(10 most common words without vectors: {}z&No word vectors present in the packagespancatspancat_singlelabelzSpan Categorizationz	Spans KeyLabels)headerdividerzLabel counts in train data: zKey: zLabel 'z-' is not present in the model labels of key 'z*'. Performance may degrade after training.z"Low number of examples for label 'z
' in key '' (zAnalyzing label distribution...z)No examples for texts WITHOUT new label ''z!Obtaining span characteristics...z$Span characteristics for spans_key 'z8SD = Span Distinctiveness, BD = Boundary Distinctivenessspans_length)	thresholdzOver z % of spans have lengths of 1 -- z (min=
min_lengthz, max=
max_lengthz%). The most common span lengths are: z. If you are using the n-gram suggester, note that omitting infrequent n-gram lengths can greatly improve speed and memory usage.z#Full distribution of span lengths: avg_sdz5Spans may not be distinct from the rest of the corpusz.Spans are distinct from the rest of the corpusp_spansz10 most common span tokens: {}avg_bdz<Boundary tokens are not distinct from the rest of the corpusz8Boundary tokens are distinct from the rest of the corpusp_boundsz'10 most common span boundary tokens: {}z<To train a new span type, your data should include at least z instances of the new labelz&Good amount of examples for all labelszpTraining data should always include examples of spans in context, as well as examples without a given span type.z5Examples without occurrences available for all labelsnerc              3   4   #    U  H  oS ;  d  M
  Uv   M     g7f)Orf   NNrX   .0labels     rK   	<genexpr>debug_data.<locals>.<genexpr>j  s      
5eFV9VEE5s   		zNamed Entity Recognitionz	 label(s)z) missing value(s) (tokens with '-' label)zEmpty label found in train datazLabels in train data: z|Some model labels are not present in the train data. The model performance may be degraded for these labels after training: .ws_entsz  invalid whitespace entity spansboundary_cross_entsz, entity span(s) crossing sentence boundariesz<No entities consisting of or starting/ending with whitespacez(No entities crossing sentence boundariesz>To train a new entity type, your data should include at least zuTraining data should always include examples of entities in context, as well as examples without a given entity type.z`Entity spans consisting of or starting/ending with whitespace characters are considered invalid.textcatz'Text Classification (Exclusive Classes)zText Classification: zLabels: catszWPotential train/dev mismatch: the train and dev labels are not the same. Train labels: z. Dev labels: r   zThe model does not have enough labels. 'textcat' requires at least two labels due to mutually-exclusive classes, e.g. LABEL/NOT_LABEL or POSITIVE/NEGATIVE for a binary classification task.n_cats_bad_valueszMUnsupported values for cats: the supported values are 1.0/True and 0.0/False.n_cats_multilabelzThe train data contains instances without mutually-exclusive classes. Use the component 'textcat_multilabel' instead of 'textcat'.zThe dev data contains instances without mutually-exclusive classes. Use the component 'textcat_multilabel' instead of 'textcat'.textcat_multilabelz Text Classification (Multilabel)zPotential train/dev mismatch: the train data contains instances without mutually-exclusive classes while the dev data contains only instances with mutually-exclusive classes.zThe train data contains only instances with mutually-exclusive classes. You can potentially use the component 'textcat' instead of 'textcat_multilabel'.zTrain/dev mismatch: the dev data contains instances without mutually-exclusive classes while the train data contains only instances with mutually-exclusive classes.taggerzPart-of-speech Taggingtagsz label(s) in train dataz  is the normalised label entropymorphologizerzMorphologizer (POS+Morph)morphsparserzDependency ParsingzFound n_sentsz' sentence(s) with an average length of z.1fz words.g?zThe training data contains z.2fz sentences per document. When there are very few documents containing more than one sentence, the parser will not learn how to segment longer texts into sentences.deps	n_nonprojz  nonprojective train sentence(s)z nonprojective dev sentence(s)z% label(s) in projectivized train dataz: zLow number of examples for z label(s) in the projectivized dependency trees used for training. You may want to projectivize labels such as punct before training in order to improve parser performance.z3Projectivized labels with low numbers of examples: z7The following labels were found only in the train data:z5The following labels were found only in the dev data:z5To train a parser, your data should include at least z instances of each label.rootsr%   zMultiple root labels (zq) found in training data. spaCy's parser uses a single root label ROOT so this distinction will not be available.z. nonprojective projectivized train sentence(s)n_cyclesz, projectivized train sentence(s) with cyclestrainable_lemmatizerzTrainable Lemmatizerlemmatizer_treesz. lemmatizer trees generated from training dataz) lemmatizer trees generated from dev dataz lemmatizer trees (z7% of dev trees) were found exclusively in the dev data.z/All trees in dev data present in training data.n_low_cardinality_lemmasz) training docs with 0 or 1 unique lemmas.z$ dev docs with 0 or 1 unique lemmas.no_lemma_annotationsz) training docs with no lemma annotations.z)All training docs have lemma annotations.z$ dev docs with no lemma annotations.z$All dev docs have lemma annotations.partial_lemma_annotationsz. training docs with partial lemma annotations.z2All training docs have complete lemma annotations.z) dev docs with partial lemma annotations.z-All dev docs have complete lemma annotations.Summary checkchecksz passedwarningwarningserrorerrors)Mr   r-   r   load_configload_model_from_configconfiginterpolater"   resolver   r!   
pipe_namesget_pipe_metafactoryrl   r#   
initializegoodlist_compile_goldtextlangjoinlenfailintersectionrG   BLANK_MODEL_THRESHOLDBLANK_MODEL_MIN_THRESHOLDinfomost_common_format_labelsvocabvectorsmodeVectorsModefloretvectors_lengthminnmaxnn_keyssumvaluesformat_get_labels_from_spancattableitemskeysNEW_LABEL_THRESHOLDloading_get_examples_without_label_get_span_characteristics_print_span_characteristics_get_spans_length_freq_dist_filter_spans_length_freq_dist SPAN_LENGTH_THRESHOLD_PERCENTAGEmax_format_freqsSPAN_DISTINCT_THRESHOLDr   BOUNDARY_DISTINCT_THRESHOLDset_get_labels_from_modelzipnumpyarraylog2DEP_LABEL_THRESHOLDr   appendre   r   GOODWARNFAILsysexit)Pr<   rC   r>   r?   r@   rD   r   cfgr   Tsourced_componentsrS   presume_componentspipelinepipefactory_names	dot_namesrU   train_datasetdev_datasetgold_train_datagold_train_unpreprocessed_datagold_dev_datatrain_texts	dev_textsoverlapr   ra   n_misalignedmost_common_wordsn_missing_vectorsmodel_labels_spancathas_low_data_warninghas_no_neg_warning	spans_keydata_labelsdata_labels_in_componentr}   countspans_key_in_modelneg_docsspan_characteristics_span_freqs_filtered_span_freqsrt   all_span_tokensw_most_common_spansrv   all_span_bound_tokensmost_common_boundslabelslabel_countsmodel_labelshas_ws_ents_errorhas_boundary_cross_ents_warningmissing_valueslabels_with_countsmissing_labels
label_listre   norm_entropysents_per_doclabels_trainlabels_train_unpreprocessed
labels_devr   rare_projectivized_labelstrees_train	trees_devdev_not_trainpctngood_countswarn_countsfail_countsrY   rT   sP                                                                                 @@rK   rI   rI   j   s    C 
{	+{G))#.'')VJ/8LM	 
, 04-.$6U$6qCT:T$6U~~HADPS&&t,44MPKK&' >"AlO4I0CL*NN,-HH45c*+Mz#'KHH!" $M=#QUVO%2}cU&" "+}cTRM!'*Kg&I-.KK !HHz#(($%HH"499X#6"7894TYY?P5Q4RST&tyy1B'C&DEFHHM"#>23HHK !!123}%&+**956GG9FGHBC ]!36K!KA#mBTAUUVW} 99HHTNHHTN/0E/F G12!5	
 KK!"i(GHH)/OG4L0M/NhW +,q0&';<L>!HIJ)*Q.$%9:L>!CDE'0<<R@HH
 0A$!O PQ 39999!![%7%77HH&s399+<+<'=&>j99++,M99$$))*!CII,=,=,B,B+C D"# HHsyy(()**SYY5F5F5M5M4N O  #		 8 89G !$O4K$L$S$S$U VHHELL%,y/IIJ HH:AA"'(?@LLRP#  	9:M!%:m%K7<$")*		&X/FPTU/g>&5i&@&F&F&H"I{HH	{"^K4E4E4GPT%U$VW 'I 2668$
8	 y1)<<8 	! $
 '?&D&D&F"I{ + 1 1 3u%.2F2K2K2M%M"&!5i!@@HH!%(UV_U` aB B
 //HH<UG:i[X[\a[bbcd ,0([[!BC:%ui H D q=HHHqQR)-&/ !42 @A'@!?I($ B
 HH;I;aHIHHOP'(<=5/	:K $B'G$  HH899Y+00234 5,\:;6BVWcBdAe f55BCW5X4Y Z   HH5mK6P5QR $H-0GGPQIJ*95<<>G'*7GI'>O/>/J/J2/N O/Ntq!/N OHH077"#45 	 $H-0KKWXST+J7>>@H-079-E!0E0Q0QRT0U!V0U1!0U!VHH9@@"#56 	e 'Gr  HH,--HJ HH=>HH 	 HHLM 
.u5
 
 'u--c59$"!*/'./C%&i01%c*N##LMNE5zQ:; 
 !- 8 8 :
 :u| UEN : 	 

 ,,>tL)*<)=>WM%.HH+N;<A?
 9%HH	233STU $EE"&998s<PUCVBWWXY (,$[[!BC:=%QVWH Dq=HHHqQR)-&  01HH"#89::fg /3+#HH=>!HHLM HHST.HH?@HH,--HJ
 HH 	 HHE
 M!=>'Y7(VY?@8N62347C#of&=">>HH+N;<A?
 v&'3}V/D+EEHH!!/0G!H I J-mF.CDEQH v;?HH' /01401A5HH* ./!3 HH
 ,-1HH },67'-AB(VY?@8N62347C#of&=">>HH+N;<A?
 v&'3}V/D+EEHH!!/0G!H I J-mF.CDEQH /01401A5HH* ./!301Q6 HHG
 01A5O = ,- /&"9"?"?"AB
FC
O$$;<=KKKUZZ]*//1EJJs:4OOL>!ABC-c8<Z%.HH+N;<A?
 ,F#//1$
 	#'2-'/0)8)BC)Be)B
C-c?CC
O$$;<=Z%.HH+N;<A?
 ,H%113D
 	#'2= $() 	_Y/0 1(3oi6PPQTUU\^	
 (	2S9Q5RR3HH-mC-@ A/ 0 ,;6+BC+B%+BC=fE'
EeEE 	$ '
 *7v)>?)>e)>
?)+6:6{CIHHvi[(HIJ%)%k2IHHvi[(FGHC3455LMNC%&&KLM+*62>>@
 	#'2 4F;E-f5e<@SS8 @6v>uEFaI (,$ < %'!$V,E'.2EE&)00gR 7 >?@ - ()A-HH-c2K.L-M NJ J HHE		34
 $(  |s:.HHI		#l+c*o=> z?S..HHG		#j/C,==>  HH,--FH -g67!;HHII<WEFG HHI ;'!+HH56 72 3 :&*HH455ab .*+ /0B C+,>?	 	C$%%STUC	N##LMN!K/}"m$s9~5CHH}%&&9#c'# G; ; HHFG56: :;AHHsCDE34q889AHHs>?@12Q6 67AHHsCDEHH@A/01445AHHs>?@HH;<67!; ;<AHHsHIJHHIJ4599:AHHsCDEHHDEKK	**X]]+K**X]]+K**X]]+KK=[A-='8!LGTUK={a/?)Z!PQRK=[A-='8!LMN A 
,	+ VPn$
, DC BAN !P  "W\
2 DCB DJ D'
 @s   A!A_3	A`'A`	%A`
?A`7A`A`&A`8*A`>AaAaAa
D
AaHAa!H2Aa&IAa+_3
A``
A`#`&
A`5	a

Aa		file_pathr   returnc                    U R                   S   nU R                  S:X  aL  UR                  " SU S35         [        R                  " U 5      nS S S 5        UR
                  " SU 35        W$ U R                  S:X  aL  UR                  " SU S35         [        R                  " U 5      nS S S 5        UR
                  " SU 35        W$ UR                  " SU R                   3SS	S
9  g ! , (       d  f       N= f! , (       d  f       NV= f)Nz.jsonzLoading z...zLoaded z.jsonlzCan't load file extension zExpected .json or .jsonlr%   )exits)partssuffixr   srsly	read_jsonr   
read_jsonlr   )r  r   	file_namer2   s       rK   
_load_filer'    s    #I7"[[8I;c23??9-D 479+&'			X	%[[8I;c23##I.D 479+&'HH
$Y%5%5$67" 43
 43s   C(C9(
C69
Dexamplesr   rY   r\   c                    0 S[        5       _S[        5       _S[        5       _S[        5       _S[        5       _S[        5       _S[        5       _S[        5       _S	[        5       _S
[        5       _S[        5       _SS_SS_SS_SS_S[        5       _SS_SSSS[        5       [        5       SSSS.	EnSU;   a  [        UR                  R
                  5      nU  GH  nUR                  nUR                  nU V	s/ s H  oR                  PM     n
n	US   R                  U
5        US==   [        U
5      -  ss'   UR                  nU HZ  nUR                  R                  5       (       a  M$  UR                  R                  UR                      S:w  d  MM  US==   S-  ss'   M\     US   R#                  UR                  5        [        UR                  R$                  5      (       ak  U Vs/ s H  oR                  PM     sn HK  nUR                  R
                  U   UR                  R$                  ;  d  M6  US   R                  U/5        MM     SU;   a  UR'                  5       n[)        UR+                  5       5       H  u  nnUc  M  UR-                  S5      (       a!  UU   R.                  (       a  US==   S-  ss'   UR-                  S5      (       a  [1        U5      nUS   U==   S-  ss'   UU   (       a%  UR-                  S5      (       a  US==   S-  ss'   M  US:X  d  M  US   S==   S-  ss'   M     SU;   d  SU;   Ga  [3        UR                  R4                  R7                  5       5       GH  nUUS   ;  a  [        5       US   U'   [)        UR                  R4                  U   5       H2  u  nnUR8                  c  M  US   U   UR8                  ==   S-  ss'   M4     UUS	   ;  a  [        5       US	   U'   UR4                  U    Hj  nUR8                  c  M  UR8                  US	   U   ;  a  / US	   U   UR8                  '   US	   U   UR8                     R;                  [        U5      5        Ml     UUS
   ;  a  [        5       US
   U'   UR4                  U    HR  nUR8                  US
   U   ;  a  / US
   U   UR8                  '   US
   U   UR8                     R;                  U5        MT     SnUUS   ;  a  [        5       US   U'   UR4                  U    H  nUR8                  US   U   ;  a  / / S.US   U   UR8                  '   [=        U5       H  nUR>                  US-   -
  nUS:  a-  US   U   UR8                     S   R;                  UUUS-    5        UR@                  US-   -   nU[        U5      ::  d  Mk  US   U   UR8                     S   R;                  UUS-
  U 5        M     M     GM     SU;   d  S U;   a  US   R                  URB                  5        [E        S! URB                  RG                  5        5       5      (       a  US"==   S-  ss'   [3        URB                  RG                  5       5      RI                  S5      S:w  a  US#==   S-  ss'   S$U;   a9  URK                  S%S&S'9nUS   R                  U V	s/ s H	  oc  M  U	PM     sn	5        S(U;   a  URK                  S)S&S'9nURK                  S*S&S'9n[M        UU5       H  u  nnUb  Uc  M  US+:X  a  US+:X  a  M  [N        RP                  " U5      nU(       a  UU[R        RT                  '   UR                  R                  R
                  UR                  R                  RV                  R#                  U5         nUS   R                  U/5        M     S,U;   a  URY                  US-9u  nn US   R                  U  V	s/ s H	  oc  M  U	PM     sn	5        [)        [M        U U5      5       H3  u  nu  n!n"U"U:X  d  M  US   R                  U!/5        US==   S-  ss'   M5     [Z        R\                  " U5      (       a  US.==   S-  ss'   [Z        R^                  " U5      (       a  US/==   S-  ss'   SU;   d  GM  [a        S0 U 5       5      (       a  US1==   S-  ss'   GM  [E        S2 U 5       5      (       a  US3==   S-  ss'   [        5       n#U H{  nURb                  S:w  d  M  U#R#                  URb                  5        WR#                  UR                  URd                  5      n$URg                  U$5      n%US4   R#                  U%5        M}     [        U#5      S5:  d  GM  [        U5      S:  d  GM  US6==   S-  ss'   GM     U$ s  sn	f s  snf s  sn	f s  sn	f )7Nrw   r   r   r   r   rb   r   rh   ro   spans_per_typesb_per_typer   r   r   ra   rc   rg   r   )	r   r   r   r   r]   r   r   r   r   r   r%   r]   )B-U-L-)r,  r-  )zI-r.  rf   ri   )startendr/  r0  r   r   c              3   *   #    U  H	  oS ;  v   M     g7f))r   r%   NrX   )r|   vals     rK   r~    _compile_gold.<locals>.<genexpr>  s     C0Bf$0Bs   r   r   r   TAGT)	as_stringr   POSMORPH r   )projectivizer   r   c              3   >   #    U  H  oR                   S :H  v   M     g7fr   Nlemmar|   tokens     rK   r~   r3         6;;!#   r   c              3   >   #    U  H  oR                   S :H  v   M     g7fr;  r<  r>  s     rK   r~   r3    r@  rA  r   r   r   r   )4r   dictr   r   r   strings	reference	predictedr   updater   	alignmentorth_isspacex2ylengthsiaddr   get_aligned_sent_starts	enumerateget_aligned_ner
startswithis_spacer    r   spansr   label_r   ranger/  r0  r   anyr   r   get_alignedr   r   feats_to_dictr   POS_FEAT
morphologyget_aligned_parser   is_nonproj_treecontains_cycleallr=  lemma_tree_to_str)&r(  r   rY   r\   r2   treeseggolddocxvalid_wordsalignr?  twordsent_startsrM  r}   combined_labelr   spanwindow_sizeoffsetsb_start_idx
sb_end_idxr   pos_tagsr   posmorph
label_dictaligned_headsaligned_depsdephead	lemma_settree_idtree_strs&                                         rK   r   r     s{	   wy	 		 	')	
 		 	 	 	46 	 	$& 	tv 	1 	q 	1 	a  	 !" 	1#$ E !%&$%5D8 .#))++,||ll'+,t!vvt,W[)Y3{++E{{""$$yy  )Q.)*a/*	 
 	W#((#syy  !!),-A-99$$T*#))2C2CC0188$@ . M!446K%b&8&8&:;5=##$677CFOOOq(O##L11%7%>NK/14/q>e&6&6|&D&D./14/c\K$)$ < %)>-)O!",,"4"4"9"9";<	DO318DOI.(););I)FGGAt{{* Y	24;;?1D?	  H D$886:fD(3 JJy1D{{* {{$~*>y*IIGI^,Y7D(3DKK@GGD	R 2 D)9$::8<D)*95 JJy1D{{$/?*@*KKIK-.y9$++F)*95dkkBII$O 2  D$7759VD'	2 JJy1D{{$}*=i*HH &(#%G]+I6t{{C #("4'+zzVaZ'@'1, /	:4;;GPWW $\L14D E &*XX!%<
%T2 /	:4;;GNUU $Z!^j A #5 2A =j %)=)NL		*C		0@0@0BCCC()Q.)DII$$&'--a0A5()Q.)}$>>%4>8DLD BDqD BCm+~~et~<H^^Gt^<F!(F3
U
 ;%- BY5B; ",!9!9%!@J=@
=#9#9:LL..66**5599*EE N))5'2' 4( }$*,*>*>I*>*V'M<LL JLqL JK"+Cm,L"M;C19M((#/Oq(O #N &&}55[!Q&!%%m44Z A% !]26666+,1,666601Q61I;;!#MM%++.#ii

ELLAG$009H+,00:  9~!c$i!m/0A50g h Kc - .` !C4 !Ks$   9g,gg
)g
g 
g 
r  re   c                     g rW   rX   r  re   s     rK   r   r     s    rM   c                     g rW   rX   r~  s     rK   r   r     s    
 rM   c                 :   U(       aO  SR                  [        [        [        [        [
        4      U 5       VVs/ s H  u  p#SU SU S3PM     snn5      $ SR                  [        [        [           U 5       Vs/ s H	  nSU S3PM     sn5      $ s  snnf s  snf )Nr^   rn   rm   r_   )r   r   r   r   strint)r  re   lcs       rK   r   r     s     yy(,XeCHo-F(OP(Oq3qc^(OP
 	
 99Xc]F(CD(C1!Ah(CDEE QDs   B
;Bfreqssortc           
      f   U(       a"  [        [        U R                  5       5      5      n U R                  5        VVs/ s H  u  p#[        U5      U4PM     nnnSR	                  [        [        [        [        [        4      U5       VVs/ s H  u  pVU SU S3PM     snn5      $ s  snnf s  snnf )Nr^   z (z%))	rC  sortedr   r  r   r   r   r   float)r  r  kv_freqsr  r  s          rK   r   r     s    VEKKM*+&+kkm4mdas1vqkmF499#'sEz1B(CV#LM#L41A3b2#LM  5Ms   B'B-
r}   	component)rw   rh   r   c                 `   SnU  H  nUS:X  a1  UR                  5        Vs/ s H  nUS;  d  M  [        U5      PM     nnUS:X  aN  X5R                  R                  ;   a3  UR                  R                  U    Vs/ s H  owR                  PM     snO/ nWW;  d  M  US-  nM     U$ s  snf s  snf )Nr   rw   ry   rh   r%   )rQ  r    rE  rT  rU  )r2   r}   r  r   r   rc  r  rm  s           rK   r   r     s     E  //11E 00 *"5)1   	!  2 22 *,););I)FG)F)FG  QJE! " L Hs   
B&B&9B+factory_namec                 6   U R                    Vs/ s H&  nU R                  U5      R                  U:X  d  M$  UPM(     nn[        5       nU HF  nU R	                  U5      n[        U[        5      (       d   eUR                  UR                  5        MH     U$ s  snf rW   )	r   r   r   r   get_pipe
isinstancer   rG  r  )rY   r  	pipe_namer   r  r   s         rK   r   r     s     'IY'//<? 	'  
 uF	||I&$....dkk"   Ms
   #BBc                    U R                    Vs/ s H&  nU R                  U5      R                  S;   d  M$  UPM(     nn0 nU Hx  nU R                  U5      n[	        U[
        5      (       d   eUR                  U;  a  [        5       X4R                  '   X4R                     R                  UR                  5        Mz     U$ s  snf )N)rh   ri   )
r   r   r   r  r  r   keyr   rG  r  )rY   r  r   r  r   s        rK   r   r     s     'IY'//3UU 	'  
 #%F	||I&$0000886!"uF88xx,   Ms
   #C C r  c                 |    [         R                  " [         R                  " S U  5       5      [        U 5      -  5      $ )z Compute geometric mean of a listc              3   N   #    U  H  n[         R                  " U5      v   M     g 7frW   )mathlog)r|   rM  s     rK   r~   _gmean.<locals>.<genexpr>'  s     51adhhqkk1s   #%)r  expfsumr   )r  s    rK   _gmeanr  %  s)    88DII5155A>??rM   metricfrequenciesc                    ^ [        U4S jU R                  5        5       5      nU[        TR                  5       5      -  $ )Nc              3   8   >#    U  H  u  pUTU   -  v   M     g 7frW   rX   )r|   	span_typevaluer  s      rK   r~   _wgt_average.<locals>.<genexpr>+  s     V~3C9I..~s   )r   r   r   )r  r  totals    ` rK   _wgt_averager  *  s2    Vv||~VVE3{))+,,,rM   	normalizec           	      z   [        5       nU  HR  nU HI  nUR                  R                  5       R                  SS5      R                  SS5      nX%==   S-  ss'   MK     MT     U(       aK  [	        UR                  5       S5      n[        UR                  5        VVs0 s H
  u  pxXxU-  _M     snn5      nU$ s  snnf )z2Get the frequency distribution given a set of Docsz``"z''r%           )r   r   lowerreplacer   r   r   )	docsr  word_countsre  r?  ri  r  r  r  s	            rK   _get_distributionr  /  s    "9KE

  "**45==dCHANaN  
 K&&(#.8I8I8KL8Kqe)|8KLM Ms   B7
r   qc                 |    SnU R                  5        H%  u  p4X$[        R                  " XAU   -  5      -  -  nM'     U$ )zHCompute the Kullback-Leibler divergence from two frequency distributionsr  )r   r  r  )r   r  r  rj  p_words        rK   _get_kl_divergencer  =  s;    E	$((6dG#3444 "LrM   	span_datac           	         ^ U V^s0 s H  mTT/[        U4S jU  5       5      -   _M     nn[        UR                  5       5      $ s  snf )z*Compile into one list for easier reportingc              3   @   >#    U  H  n[        UT   5      v   M     g 7frW   )r(   )r|   dr}   s     rK   r~   #_format_span_row.<locals>.<genexpr>H  s     J	1nQuX66	s   )r   r   )r  r  r}   r  s     ` rK   _format_span_rowr  E  sT     	E 	wJ	JJJJ  	 
		s   $Acompiled_goldc                    US   U   nUS   U   R                  5        VVs0 s H  u  pEU[        U5      _M     nnnUS   U   R                  5        VVs0 s H  u  pGU[        U5      _M     nnnUS   U   R                  5        Vs/ s H  n[	        U5      PM     n	nUS   U   R                  5        Vs/ s H  n[        U5      PM     n
n[        U  Vs/ s H  oR                  PM     snSS9nUS   U   R                  5        VVs0 s H  u  pGU[        USS9_M     nnnUS   U   R                  5        VVs0 s H  u  pNU[        US   US   -   SS9_M     nnnUR                  5        VVs0 s H  u  nnU[        UU5      _M     nnnUR                  5        VVs0 s H  u  nnU[        UU5      _M     nnnUUUU[	        U	5      [        U
5      [        UU5      [        UU5      [        Xc5      [        UR                  5       5      UUS	.$ s  snnf s  snnf s  snf s  snf s  snf s  snnf s  snnf s  snnf s  snnf )
zObtain all span characteristicsrh   ro   r*  T)r  r+  r/  r0  )sdbdr*  rL  rq   rr   rs   ru   
avg_lengthr  rt   rv   )r   r  r   r   minr   r  rE  r  r  r   r   )r(  r  r   r   r}   r  span_lengthrT  r*  min_lengthsmax_lengthsrc  p_corpusrt   sbrv   	freq_distspan_distinctivenesssb_distinctivenesss                      rK   r   r   N  s     	*95K &n5i@FFHHHE 	vayH   **:;IFLLNNLE 	s5zN   $1#@#K#R#R#TU#Ta3q6#TKU#0#@#K#R#R#TU#Ta3q6#TKU !!B2,,!BdSH **:;IFLLNNLE 	 $77N   '}5i@FFHHIE 	 Gr%y!8DIIH   !( /E9 	!)X66 /   !) 0 0E9 	!)X66 0   # (+&+&3[A1;?";<{'')* A VU "Cs5   HH	H#8H(H-H2? H85H>%Ir   c                 j   Sn[        S[        S U S    5       5      5      nU S   U S   U S   U S   /n[        X0S   S	9nU S
   U S   U S   /nS/U Vs/ s H  nSR                  [        US5      5      PM      sn-   S/-   n[        R
                  " UUUSS/S/[        U5      S-   -  -   US9  gs  snf )z+Print all span characteristics into a table)z	Span TypeLengthSDBDN   c              3   8   #    U  H  n[        U5      v   M     g 7frW   )r   r{   s     rK   r~   ._print_span_characteristics.<locals>.<genexpr>  s     Q2P#e**2Ps   r  rL  r  r  r*  )r  r  r  rs   ru   zWgt. Averagez{:.2f}r   rf   Tr  rr%   )footerrk   rl   alignsmax_colN)r   r  r   roundr   r   r   )r   headersr  
table_datar   footer_datafr  s           rK   r   r     s    6G"cQ2Fx2PQQRG 	Y'T"T"-.	J (%CE
 	\*X&X&K 
+N+QHOOE!QK8+NNRUQVV  IIuuK 01 455 Os   %B0length_dictc                 L   / nU R                  5        H  u  p4UR                  U5        M     [        5       nU H+  nUR                  U5      (       a  XV==   S-  ss'   M'  SXV'   M-     0 nUR	                  5        H&  u  pU	[        U5      -  S-  n
[        U
S5      n
XU'   M(     U$ )zDGet frequency distribution of spans length under a certain thresholdr%   g      Y@r   )r   extendr   getr   r   r  )r  rp   all_span_lengthsr  rL  r  rM  freq_dist_percentager  r   
percentages              rK   r   r     s     !'')
( * !I==LALIL	  '335c"233u<
:q)
,6[) 6
  rM   r  rp   c                 `    Sn0 nU R                  5        H  u  pEX!:  a    U$ XSU'   X%-  nM     U$ )zFilter frequency distribution with respect to a threshold

We're going to filter all the span lengths that fall
around a percentage threshold when summed.
r  )r   )r  rp   r  filtered_freq_distr  dists         rK   r   r     sM     E&__.  /3{+ / rM   )F)T)rw   sc)_r  r   collectionsr   pathlibr   typingr   r   r   r   r	   r
   r   r   r   r   r   r   r#  typerwasabir   r   r   r8  r   compatr   languager   r[  r   r   r   r   r   (pipeline._edit_tree_internals.edit_treesr   pipeline._parser_internalsr   "pipeline._parser_internals.nonprojr   schemasr   rQ   r   r    training.initializer!   r"   r#   r   r$   r   _utilr&   r'   r(   r)   r*   r+   r,   r-   r   r   r   r   r   r   r   rE   ContextboolrL   r  rI   r'  r   r   r  r  r   r   r   r   r  r  r  r  r  r   r   r   r   rX   rM   rK   <module>r     s    
         ) )    # D D @ 0 : * 2 8 . )	 	 	       #%   
$RVW *.$O C&;DUYZ #D-4  O_  !`':EHuv{D7fg%<\] 	    ~	 
       L (*!O
O
 38nO
 	O

 O
 O
 O
d$ W  &xwx9x 
x 	x
 
#s(^xv 
8C= '%. S  
 
U38_%DM 	 
 F(3-%S/!::;FF 	Fc5j)    ,1#	
7
 '( }	
 	6  C ( tCSM/B  @d @u @
-c5j) - -E -
t w ' g % T
 DI $s) 37m3,0cN3GJ3	#s(^3l d38n  H "B  	#u* 2CJ,/	#u*rM   