
    h                    P   S r SSKJr  SSKJrJrJrJrJrJ	r	J
r
JrJr  SSKrSSKrSSKJrJrJr  SSKJr  SSKJrJrJrJrJrJrJrJrJr  SrS	r  " S
 S5      r!\!RD                  r"\!RF                  r#\!RH                  r$\!r%SS\\&\!4   S\'S\&4S jjr(S\\&\!4   S\'4S jr)S\\&\!4   S\\&\!4   S\*4S jr+SS\\&\!4   S\\&\!4   S\'S\*4S jjr, SS\\&\!4   S\\&   S\*S\\&\*4   4S jjr-  SS\\&\!4   S\\&   S\*S\'S\\&\*4   4
S jjr. S S\\&\!4   S\\&   S\*S\
\&   4S jjr/g)!af  
langcodes knows what languages are. It knows the standardized codes that
refer to them, such as `en` for English, `es` for Spanish and `hi` for Hindi.
Often, it knows what these languages are called *in* a language, and that
language doesn't have to be English.

See README.md for the main documentation, or read it on GitHub at
https://github.com/LuminosoInsight/langcodes/ . For more specific documentation
on the functions in langcodes, scroll down and read the docstrings.

Some of these functions, particularly those that work with the names of
languages, require the `language_data` module to be installed.
    )
itemgetter)	AnyListTupleDictSequenceIterableOptionalMappingUnionN)LanguageTagError	parse_tagnormalize_characters)tuple_distance_cached)	ALL_SCRIPTSDEFAULT_SCRIPTSLANGUAGE_REPLACEMENTSLANGUAGE_ALPHA3LANGUAGE_ALPHA3_BIBLIOGRAPHICTERRITORY_REPLACEMENTSNORMALIZED_MACROLANGUAGESLIKELY_SUBTAGSVALIDITYenz
Looking up language names now requires the `language_data` package.

Install it with:
    pip install language_data
Or as an optional feature of langcodes:
    pip install langcodes[data]
c                   r   \ rS rSr% Sr/ SQr1 SkSS1SS1S1S10 /r1 SkSS1S1/r0 r\	\
S 4   \S'   0 r\	\\\4   S 4   \S	'          SLS\\   S\\\      S\\   S\\   S\\\      S\\\      S\\   4S jjr\       SLS\\   S\\\      S\\   S\\   S\\\      S\\\      S\\   SS 4S jj5       r\SMS\\S 4   SS 4S jj5       rS\4S jrSNS jrSNS jrSNS jrSOS\S\4S jjrS\\   4S jrSPS jrSNS jr \ r!SS S\"4S jr#SQSS S\S\"4S  jjr$S\4S! jr%S\4S" jr&S#\S\\S 4   S$\"S\4S% jr'S&\(\\4   SS S$\"4S' jr)\*S(4S\\S 4   S$\"S\4S) jjr+\*S(4S\\S 4   S$\"S\4S* jjr,S\4S+ jr-S\4S, jr.SRS$\"S\4S- jjr/\*S(4S\\S 4   S$\"S\4S. jjr0\*S(4S\\S 4   S$\"S\4S/ jjr1\*S(4S\\S 4   S$\"S\4S0 jjr2\3S1 5       r4\*S(4S\\S 4   S$\"S\\   4S2 jjr5\*S(4S\\S 4   S$\"S\64S3 jjr7S\"4S4 jr8S\"4S5 jr9\ SSS6\S7\S\\\S 4      SS 4S8 jj5       r:\ SSS7\S\\\S 4      SS 4S9 jj5       r;S\64S: jr<STS; jr=S<\6SS 4S= jr>\S>\6S?\?\   S\64S@ j5       r@SA\?\   SS 4SB jrASNSC jrBSD rCS\"4SE jrDSF\S\\\\\   4      4SG jrESF\S\4SH jrFS\4SI jrGS\4SJ jrHSKrIg
)ULanguage0   a%  
The Language class defines the results of parsing a language tag.
Language objects have the following attributes, any of which may be
unspecified (in which case their value is None):

- *language*: the code for the language itself.
- *script*: the 4-letter code for the writing system being used.
- *territory*: the 2-letter or 3-digit code for the country or similar territory
  of the world whose usage of the language appears in this text.
- *extlangs*: a list of more specific language codes that follow the language
  code. (This is allowed by the language code syntax, but deprecated.)
- *variants*: codes for specific variations of language usage that aren't
  covered by the *script* or *territory* codes.
- *extensions*: information that's attached to the language code for use in
  some specific system, such as Unicode collation orders.
- *private*: a code starting with `x-` that has no defined meaning.

The `Language.get` method converts a string to a Language instance.
It's also available at the top level of this module as the `get` function.
languageextlangsscript	territoryvariants
extensionsprivate>   r!   r   r"   r   r"   r!   
_INSTANCES_PARSE_CACHENr    r#   r$   r%   c                    Xl         X l        X0l        X@l        XPl        X`l        Xpl        SU l        SU l        SU l	        SU l
        SU l        SU l        SU l        SU l        SU l        SU l        U R#                  5         g)z
The constructor for Language objects.

It's inefficient to call this directly, because it can't return
an existing instance. Instead, call Language.make(), which
has the same signature.
N)r   r    r!   r"   r#   r$   r%   _simplified_searchable_broader_assumed_filled_macrolanguage_str_tag_dict_disp_separator_disp_patternto_tag)selfr   r    r!   r"   r#   r$   r%   s           L/home/james-whalen/.local/lib/python3.13/site-packages/langcodes/__init__.py__init__Language.__init__e   s    " ! " $ (,'+#'$(#'48!
$("& 	    returnc           
          U[        U=(       d    S5      UU[        U=(       d    S5      [        U=(       d    S5      U4nXR                  ;   a  U R                  U   $ U " UUUUUUUS9n	XR                  U'   U	$ )z
Create a Language object by giving any subset of its attributes.

If this value has been created before, return the existing value.
 r   )tupler&   )
clsr   r    r!   r"   r#   r$   r%   valuesinstances
             r5   makeLanguage.make   s    " (.b!(.b!*"#
 ^^#>>&))!
 "*vr8   tagc                    [        U [        5      (       a  U(       d  U $ U R                  5       n X4[        R                  ;   a  [        R                  X4   $ 0 n[	        U 5      nU(       a  U[
        ;   a	  [
        U   n [        U 5      nU GH  u  pVUS:X  a  U(       a  SU;   a  US    SU 3n[
        R                  " [	        U5      5      nUb4  UR                  [        R                  X5      R                  5       5        My  UR                  S/ 5      R                  U5        M  US;   a&  UR                  US-   / 5      R                  U5        M  US:X  ay  US:X  a  M  U(       ac  [
        R                  " UR                  5       5      n	U	b5  UR                  [        R                  X5      R                  5       5        GM9  XbS'   GM@  XbS'   GMG  US:X  a9  U(       a+  [        R                  " UR                  5       U5      US'   GM  XbS'   GM  US	:X  a  XbS'   GM  XbU'   GM     [        R                  " S
0 UD6n
U
[        R                  X4'   U
$ )a  
Create a Language object from a language tag string.

If normalize=True, non-standard or overlong tags will be replaced as
they're interpreted. This is recommended.

Here are several examples of language codes, which are also test cases.
Most language codes are straightforward, but these examples will get
pretty obscure toward the end.

>>> Language.get('en-US')
Language.make(language='en', territory='US')

>>> Language.get('zh-Hant')
Language.make(language='zh', script='Hant')

>>> Language.get('und')
Language.make()

This function is idempotent, in case you already have a Language object:

>>> Language.get(Language.get('en-us'))
Language.make(language='en', territory='US')

The non-code 'root' is sometimes used to represent the lack of any
language information, similar to 'und'.

>>> Language.get('root')
Language.make()

By default, getting a Language object will automatically convert
deprecated tags:

>>> Language.get('iw')
Language.make(language='he')

>>> Language.get('in')
Language.make(language='id')

One type of deprecated tag that should be replaced is for sign
languages, which used to all be coded as regional variants of a
fictitious global sign language called 'sgn'. Of course, there is no
global sign language, so sign languages now have their own language
codes.

>>> Language.get('sgn-US')
Language.make(language='ase')

>>> Language.get('sgn-US', normalize=False)
Language.make(language='sgn', territory='US')

'en-gb-oed' is a tag that's grandfathered into the standard because it
has been used to mean "spell-check this with Oxford English Dictionary
spelling", but that tag has the wrong shape. We interpret this as the
new standardized tag 'en-gb-oxendict', unless asked not to normalize.

>>> Language.get('en-gb-oed')
Language.make(language='en', territory='GB', variants=['oxendict'])

>>> Language.get('en-gb-oed', normalize=False)
Language.make(language='en-gb-oed')

'zh-min-nan' is another oddly-formed tag, used to represent the
Southern Min language, which includes Taiwanese as a regional form. It
now has its own language code.

>>> Language.get('zh-min-nan')
Language.make(language='nan')

The vague tag 'zh-min' is now also interpreted as 'nan', with a private
extension indicating that it had a different form:

>>> Language.get('zh-min')
Language.make(language='nan', private='x-zh-min')

Occasionally Wiktionary will use 'extlang' tags in strange ways, such
as using the tag 'und-ibe' for some unspecified Iberian language.

>>> Language.get('und-ibe')
Language.make(extlangs=['ibe'])

Here's an example of replacing multiple deprecated tags.

The language tag 'sh' (Serbo-Croatian) ended up being politically
problematic, and different standards took different steps to address
this. The IANA made it into a macrolanguage that contains 'sr', 'hr',
and 'bs'. Unicode further decided that it's a legacy tag that should
be interpreted as 'sr-Latn', which the language matching rules say
is mutually intelligible with all those languages.

We complicate the example by adding on the territory tag 'QU', an old
provisional tag for the European Union, which is now standardized as
'EU'.

>>> Language.get('sh-QU')
Language.make(language='sr', script='Latn', territory='EU')
extlangr   -r    >   rD   variant	extensionsundr"   grandfatheredr;   )
isinstancer   r3   r'   r   r   r   getupdateto_dict
setdefaultappendlowerr   r@   )rB   	normalizedata	tag_lower
componentstypvalueminitagnormreplacementresults              r5   rL   Language.get   s   F c8$$

 **,Cx444((88! )-	&;;'	2Cs^
$JCiI*2D!*-.aw7,001Eg1NO#KKT = E E GHOOJ3::5A;;c	2.55e<
"E>"7";";EKKM"JK". HLL$H$P$P$RS+0Z(',$#(>(B(B5;;=RW(XD%(-%'
 $)Z !S	I %L &&06cn-r8   c                 (   U R                   b  U R                   $ S/nU R                  (       a  U R                  US'   U R                  (       a-  [        U R                  5       H  nUR	                  U5        M     U R
                  (       a  UR	                  U R
                  5        U R                  (       a  UR	                  U R                  5        U R                  (       a-  [        U R                  5       H  nUR	                  U5        M     U R                  (       a$  U R                   H  nUR	                  U5        M     U R                  (       a  UR	                  U R                  5        SR                  U5      U l         U R                   $ )a  
Convert a Language back to a standard language tag, as a string.
This is also the str() representation of a Language object.

>>> Language.make(language='en', territory='GB').to_tag()
'en-GB'

>>> Language.make(language='yue', script='Hant', territory='HK').to_tag()
'yue-Hant-HK'

>>> Language.make(script='Arab').to_tag()
'und-Arab'

>>> str(Language.make(territory='IN'))
'und-IN'
rI   r   rE   )r/   r   r    sortedrP   r!   r"   r#   r$   r%   join)r4   subtagsrD   rF   exts        r5   r3   Language.to_tag\  s    " ==$== '==GAJ==!$--0w' 1;;NN4;;'>>NN4>>*==!$--0w' 1??s# '<<NN4<<()}}r8   c                 B   U R                   b  U R                   $ U R                  (       ad  U R                  (       aS  [        R                  " U R                  5      U R                  :X  a%  U R                  SS05      nXl         U R                   $ X l         U R                   $ )a  
Remove the script from some parsed language data, if the script is
redundant with the language.

>>> Language.make(language='en', script='Latn').simplify_script()
Language.make(language='en')

>>> Language.make(language='yi', script='Latn').simplify_script()
Language.make(language='yi', script='Latn')

>>> Language.make(language='yi', script='Hebr').simplify_script()
Language.make(language='yi')
Nr!   )r)   r   r!   r   rL   update_dict)r4   r[   s     r5   simplify_scriptLanguage.simplify_script  s}     '###==T[[""4==1T[[@))8T*:;#) '''r8   c                 J   U R                   b  U R                   $ U R                  (       aG  U R                  (       d6   U R                  S[        U R                     05      U l         U R                   $ X l         U R                   $ ! [
         a    X l          U R                   $ f = f)ao  
Fill in the script if it's missing, and if it can be assumed from the
language subtag. This is the opposite of `simplify_script`.

>>> Language.make(language='en').assume_script()
Language.make(language='en', script='Latn')

>>> Language.make(language='yi').assume_script()
Language.make(language='yi', script='Hebr')

>>> Language.make(language='yi', script='Latn').assume_script()
Language.make(language='yi', script='Latn')

This fills in nothing when the script cannot be assumed -- such as when
the language has multiple scripts, or it has no standard orthography:

>>> Language.make(language='sr').assume_script()
Language.make(language='sr')

>>> Language.make(language='eee').assume_script()
Language.make(language='eee')

It also dosn't fill anything in when the language is unspecified.

>>> Language.make(territory='US').assume_script()
Language.make(territory='US')
r!   )r,   r   r!   rd   r   KeyErrorr4   s    r5   assume_scriptLanguage.assume_script  s    8 ==$== ==% $ 0 0t}}=>! }} !M}}	  % $ }}	%s   )B B"!B"c                     U R                   b  U R                   $ U R                  =(       d    SnU[        ;   a+  U R                  S[        U   05      U l         U R                   $ X l         U R                   $ )a  
BCP 47 doesn't specify what to do with macrolanguages and the languages
they contain. The Unicode CLDR, on the other hand, says that when a
macrolanguage has a dominant standardized language, the macrolanguage
code should be used for that language. For example, Mandarin Chinese
is 'zh', not 'cmn', according to Unicode, and Malay is 'ms', not 'zsm'.

This isn't a rule you'd want to follow in all cases -- for example, you may
want to be able to specifically say that 'ms' (the Malay macrolanguage)
contains both 'zsm' (Standard Malay) and 'id' (Indonesian). But applying
this rule helps when interoperating with the Unicode CLDR.

So, applying `prefer_macrolanguage` to a Language object will
return a new object, replacing the language with the macrolanguage if
it is the dominant language within that macrolanguage. It will leave
non-dominant languages that have macrolanguages alone.

>>> Language.get('arb').prefer_macrolanguage()
Language.make(language='ar')

>>> Language.get('cmn-Hant').prefer_macrolanguage()
Language.make(language='zh', script='Hant')

>>> Language.get('yue-Hant').prefer_macrolanguage()
Language.make(language='yue', script='Hant')
rI   r   )r.   r   r   rd   )r4   r   s     r5   prefer_macrolanguageLanguage.prefer_macrolanguage  sv    6 *&&&==)E00"&"2"26x@A#D
 """ #'"""r8   rF   c                     UR                  5       nUS;  a  [        S5      eU R                  nUc  g[        U5      S:X  a  U$ US:X  a  U[        ;   a	  [        U   $ U[
        ;   a	  [
        U   $ [        U< S35      e)a  
Get the three-letter language code for this language, even if it's
canonically written with a two-letter code.

These codes are the 'alpha3' codes defined by ISO 639-2.

When this function returns, it always returns a 3-letter string. If
there is no known alpha3 code for the language, it raises a LookupError.

In cases where the distinction matters, we default to the 'terminology'
code. You can pass `variant='B'` to get the 'bibliographic' code instead.
For example, the terminology code for German is 'deu', while the
bibliographic code is 'ger'.

(The confusion between these two sets of codes is a good reason to avoid
using alpha3 codes. Every language that has two different alpha3 codes
also has an alpha2 code that's preferred, such as 'de' for German.)

>>> Language.get('fr').to_alpha3()
'fra'
>>> Language.get('fr-CA').to_alpha3()
'fra'
>>> Language.get('fr').to_alpha3(variant='B')
'fre'
>>> Language.get('de').to_alpha3(variant='T')
'deu'
>>> Language.get('ja').to_alpha3()
'jpn'
>>> Language.get('un').to_alpha3()
Traceback (most recent call last):
    ...
LookupError: 'un' is not a known language code, and has no alpha3 code.


All valid two-letter language codes have corresponding alpha3 codes,
even the un-normalized ones. If they were assigned an alpha3 code by ISO
before they were assigned a normalized code by CLDR, these codes may be
different:

>>> Language.get('tl', normalize=False).to_alpha3()
'tgl'
>>> Language.get('tl').to_alpha3()
'fil'
>>> Language.get('sh', normalize=False).to_alpha3()
'hbs'


Three-letter codes are preserved, even if they're unknown:

>>> Language.get('qqq').to_alpha3()
'qqq'
>>> Language.get('und').to_alpha3()
'und'
BTzVariant must be 'B' or 'T'rI      Bz6 is not a known language code, and has no alpha3 code.)upper
ValueErrorr   lenr   r   LookupError)r4   rF   r   s      r5   	to_alpha3Language.to_alpha3  s    n --/$9::==]aO#~(.K"K4X>>_,&x00!l #. . r8   c                    U R                   b  U R                   $ U R                  5       /U l         [        U R                  5       /5      nU R                   Ho  nX R	                  5       4 HW  nUR                  U5      nUR                  5       nXQ;  d  M+  U R                   R                  U5        UR                  U5        MY     Mq     U R                   $ )aC  
Iterate through increasingly general tags for this language.

This isn't actually that useful for matching two arbitrary language tags
against each other, but it is useful for matching them against a known
standardized form, such as in the CLDR data.

The list of broader versions to try appears in UTR 35, section 4.3,
"Likely Subtags".

>>> Language.get('nn-Latn-NO-x-thingy').broader_tags()
['nn-Latn-NO-x-thingy', 'nn-Latn-NO', 'nn-NO', 'nn-Latn', 'nn', 'und-Latn', 'und']

>>> Language.get('arb-Arab').broader_tags()
['arb-Arab', 'ar-Arab', 'arb', 'ar', 'und-Arab', 'und']
)r+   r3   setBROADER_KEYSETSrm   _filter_attributesrP   add)r4   seenkeysetstart_languagefilteredrB   s         r5   broader_tagsLanguage.broader_tags8  s    " ==$== DKKM?#**F#')B)B)D"E)<<VDoo'?MM((-HHSM #F + }}r8   c                 t    U R                  5        Vs/ s H  n[        R                  U5      PM     sn$ s  snf )zH
Like `broader_tags`, but returrns Language objects instead of strings.
)r   r   rL   )r4   rB   s     r5   broadenLanguage.broadenV  s0     .2->->-@A-@cS!-@AAAs   5c                     U R                   b  U R                   $ U R                  5        HC  nU[        ;   d  M  [        R	                  [        U   SS9nUR                  U 5      nX l         Us  $    [        S5      e)a]  
The Unicode CLDR contains a "likelySubtags" data file, which can guess
reasonable values for fields that are missing from a language tag.

This is particularly useful for comparing, for example, "zh-Hant" and
"zh-TW", two common language tags that say approximately the same thing
via rather different information. (Using traditional Han characters is
not the same as being in Taiwan, but each implies that the other is
likely.)

These implications are provided in the CLDR supplemental data, and are
based on the likelihood of people using the language to transmit text
on the Internet. (This is why the overall default is English, not
Chinese.)

It's important to recognize that these tags amplify majorities, and
that not all language support fits into a "likely" language tag.

>>> str(Language.get('zh-Hant').maximize())
'zh-Hant-TW'
>>> str(Language.get('zh-TW').maximize())
'zh-Hant-TW'
>>> str(Language.get('ja').maximize())
'ja-Jpan-JP'
>>> str(Language.get('pt').maximize())
'pt-Latn-BR'
>>> str(Language.get('und-Arab').maximize())
'ar-Arab-EG'
>>> str(Language.get('und-CH').maximize())
'de-Latn-CH'

As many standards are, this is US-centric:

>>> str(Language.make().maximize())
'en-Latn-US'

"Extlangs" have no likely-subtags information, so they will give
maximized results that make no sense:

>>> str(Language.get('und-ibe').maximize())
'en-ibe-Latn-US'
FrR   zWCouldn't fill in likely values. This represents a problem with the LIKELY_SUBTAGS data.)r-   r   r   r   rL   rM   RuntimeError)r4   rB   r[   s      r5   maximizeLanguage.maximize\  sw    V <<#<<$$&Cn$!nS&9UKt,% ' '
 	
r8   	supportedc                 t    [         R                  " S[        5        S[        U R	                  U5      S5      -
  $ )zj
DEPRECATED: use .distance() instead, which uses newer data and is _lower_
for better matching languages.
z`match_score` is deprecated because it's based on deprecated CLDR info. Use `distance` instead, which is _lower_ for better matching languages. d   )warningswarnDeprecationWarningmindistance)r4   r   s     r5   match_scoreLanguage.match_score  s6    
 	W	

 Sy13777r8   ignore_scriptc                    X:X  a  gU R                   c  U R                  c  U R                  c  SnOJU R                  5       R	                  5       nUR                   U(       a  SOUR                  UR                  4nUR                   c  UR                  c  UR                  c  SnOJUR                  5       R	                  5       nUR                   U(       a  SOUR                  UR                  4n[        X55      $ )a  
Suppose that `self` is the language that the user desires, and
`supported` is a language that is actually supported.

This method returns a number from 0 to 134 measuring the 'distance'
between the languages (lower numbers are better). This is not a
symmetric relation. If `ignore_script` is `True`, the script will
not be used in the comparison, possibly resulting in a smaller
'distance'.

The language distance is not really about the linguistic similarity or
history of the languages; instead, it's based largely on sociopolitical
factors, indicating which language speakers are likely to know which
other languages in the present world. Much of the heuristic is about
finding a widespread 'world language' like English, Chinese, French, or
Russian that speakers of a more localized language will accept.

A version that works on language tags, as strings, is in the function
`tag_distance`. See that function for copious examples.
r   N)rI   ZzzzZZ)r   r!   r"   rm   r   r   )r4   r   r   desired_tripledesired_completesupported_triplesupported_completes          r5   r   Language.distance  s    * 
 == T[[%8T^^=S2N#88:CCE !))%+;+B+B **N &  (##+4!*!?!?!A!J!J!L #++%+=+D+D",,  %^FFr8   c                 x   U R                   b  [        U R                   5      S:  a  gU R                  U R                  U R                  /n/ nU R
                  b  UR                  U R
                  5        U HZ  nUc  M  UR                  U5        UR                  S5      (       a  M1  [        R                  " U5      (       a  MN  U[        ;  d  MZ    g   U R                  (       a.  UR                  U R                   Vs/ s H  oDSS PM	     sn5        [        [        U5      5      [        U5      :w  a  ggs  snf )ad  
Checks whether the language, script, territory, and variants
(if present) are all tags that have meanings assigned by IANA.
For example, 'ja' (Japanese) is a valid tag, and 'jp' is not.

The data is current as of CLDR 40.

>>> Language.get('ja').is_valid()
True
>>> Language.get('jp').is_valid()
False
>>> Language.get('en-001').is_valid()
True
>>> Language.get('en-000').is_valid()
False
>>> Language.get('en-Latn').is_valid()
True
>>> Language.get('en-Latnx').is_valid()
False
>>> Language.get('und').is_valid()
True
>>> Language.get('en-GB-oxendict').is_valid()
True
>>> Language.get('en-GB-oxenfree').is_valid()
False
>>> Language.get('x-heptapod').is_valid()
True

Some scripts are, confusingly, not included in CLDR's 'validity' pattern.
If a script appears in the IANA registry, we consider it valid.

>>> Language.get('ur-Aran').is_valid()
True
>>> Language.get('cu-Cyrs').is_valid()
True

A language tag with multiple extlangs will parse, but is not valid.
The only allowed example is 'zh-min-nan', which normalizes to the
language 'nan'.

>>> Language.get('zh-min-nan').is_valid()
True
>>> Language.get('sgn-ase-bfi').is_valid()
False

These examples check that duplicate tags are not valid:

>>> Language.get('de-1901').is_valid()
True
>>> Language.get('de-1901-1901').is_valid()
False
>>> Language.get('en-a-bbb-c-ddd').is_valid()
True
>>> Language.get('en-a-bbb-a-ddd').is_valid()
False

Of course, you should be prepared to catch a failure to parse the
language code at all:

>>> Language.get('C').is_valid()
Traceback (most recent call last):
...
langcodes.tag_parser.LanguageTagError: Expected a language code, got 'c'
N   Fzx-   T)r    ru   r   r!   r"   r#   extendrP   
startswithr   matchr   r$   rz   )r4   r`   checked_subtagssubtagrG   s        r5   is_validLanguage.is_valid  s    B ==$ 4==!A%==$++t~~>==$NN4==)F!&&v.((..x~~f7M7M[0$  ??""4??#S?ibqM?#STs?#$O(<< $Ts   ?D7c                      SSK Jn  [        U R                  5       5      U-  n[        U5      $ ! [         a    [        [        [
        R                  S9  e f = f)a  
Return True when we can name languages in this language. Requires
`language_data` to be installed.

This is true when the language, or one of its 'broader' versions, is in
the list of CLDR target languages.

>>> Language.get('fr').has_name_data()
True
>>> Language.get('so').has_name_data()
True
>>> Language.get('enc').has_name_data()
False
>>> Language.get('und').has_name_data()
False
r   )LANGUAGES_WITH_NAME_DATAfile)
language_data.name_datar   ImportErrorprintLANGUAGE_NAME_IMPORT_MESSAGEsysstdoutrz   r   bool)r4   r   matchess      r5   has_name_dataLanguage.has_name_data7  sR    "	H
 d'')*-EEG}  	.SZZ@	s	   / (A	attributemax_distancec                     SSK Jn  XR                  ;   d   e[        U[        5      (       a  [        R                  U5      n[        X5      nUc
  US:X  a  SnOg U" U5      nU R                  XbU5      nUb  U$ S nUS:X  a  SnOUS:X  a  SnOUS:X  a  S	nS n	Ub  U" U5      nU R                  XbU5      n	U	c  S
n	U	 SU S3$ ! [         a    [        [        [
        R                  S9  e f = f)Nr   )code_to_namesr   r   rI   r!   r   r"   r   zUnknown language subtagz [])language_data.namesr   r   r   r   r   r   
ATTRIBUTESrK   strr   rL   getattr
_best_name)
r4   r   r   r   r   
attr_valuenamesr[   placeholderunknown_names
             r5   	_get_nameLanguage._get_nameU  s   	9
 OO+++h$$||H-HT-
J&"
j),?M KJ&#h&$k)"L&%k2#uM#8"^2j\33G  	.SZZ@	s   C (C*r   c                     [        UR                  5       5      n[        UR                  5       5       Vs/ s H  oUU;   d  M
  UPM     nn[	        X&U5      u  pxXq;   a  X   $ UR                  [        5      $ s  snf N)rz   r   r^   keysclosest_matchrL   DEFAULT_LANGUAGE)	r4   r   r   r   matchable_languageskeypossible_languagestarget_languagescores	            r5   r   Language._best_name  s     "("7"7"9:!%**,/
/C:M3MC/ 	 
 "/,"
 #))99-..
s   	A6A6   c                 &    U R                  SX5      $ )u  
Give the name of the language (not the entire tag, just the language part)
in a natural language. The target language can be given as a string or
another Language object.

By default, things are named in English:

>>> Language.get('fr').language_name()
'French'
>>> Language.get('el').language_name()
'Greek'

But you can ask for language names in numerous other languages:

>>> Language.get('fr').language_name('fr')
'français'
>>> Language.get('el').language_name('fr')
'grec'

Why does everyone get Slovak and Slovenian confused? Let's ask them.

>>> Language.get('sl').language_name('sl')
'slovenščina'
>>> Language.get('sk').language_name('sk')
'slovenčina'
>>> Language.get('sl').language_name('sk')
'slovinčina'
>>> Language.get('sk').language_name('sl')
'slovaščina'
r   r   r4   r   r   s      r5   language_nameLanguage.language_name  s    F ~~j(AAr8   c                    U R                  5       n[        R                  U5      nUR                  X5      n/ nUR                  b   UR                  UR                  X5      5        UR                  b   UR                  UR                  X5      5        U(       a@  UR                  5       R                  U5      nUR                  5       nUR                  XF5      $ U$ )u  
It's often helpful to be able to describe a language code in a way that a user
(or you) can understand, instead of in inscrutable short codes. The
`display_name` method lets you describe a Language object *in a language*.

The `.display_name(language, min_score)` method will look up the name of the
language. The names come from the IANA language tag registry, which is only in
English, plus CLDR, which names languages in many commonly-used languages.

The default language for naming things is English:

    >>> Language.make(language='fr').display_name()
    'French'

    >>> Language.make().display_name()
    'Unknown language'

    >>> Language.get('zh-Hans').display_name()
    'Chinese (Simplified)'

    >>> Language.get('en-US').display_name()
    'English (United States)'

But you can ask for language names in numerous other languages:

    >>> Language.get('fr').display_name('fr')
    'français'

    >>> Language.get('fr').display_name('es')
    'francés'

    >>> Language.make().display_name('es')
    'lengua desconocida'

    >>> Language.get('zh-Hans').display_name('de')
    'Chinesisch (Vereinfacht)'

    >>> Language.get('en-US').display_name('zh-Hans')
    '英语（美国）'
)re   r   rL   r   r!   rP   script_namer"   territory_name_display_separatorr_   _display_patternformat)r4   r   r   reducedr   extra_partsclarificationpatterns           r5   display_nameLanguage.display_name  s    Z &&(<<)--hE>>%w228JK(w55hMN$779>>{KM//1G>>-??  r8   c                     U R                   b  U R                   $ U R                  [        R                  S5      5      S::  d(  U R                  [        R                  S5      5      S::  a  SU l         U R                   $ SU l         U R                   $ )zd
Get the pattern, according to CLDR, that should be used for clarifying
details of a language code.
zhr   zzh-Hantu   {0}（{1}）z	{0} ({1}))r2   r   r   rL   ri   s    r5   r   Language._display_pattern  s}     )%%%==d+,2dmmHLLQZD[6\`b6b!/D !!! "-D!!!r8   c                      SSK Jn  U R                  b  U R                  $ [        XR                  5       5      u  p#X   U l        U R                  $ ! [         a    [        [        [
        R                  S9  e f = f)z
Get the symbol that should be used to separate multiple clarifying
details -- such as a comma in English, or an ideographic comma in
Japanese.

Requires that `language_data` is installed.
r   )DISPLAY_SEPARATORSr   )
r   r   r   r   r   r   r   r1   r   r   )r4   r   matched_dists       r5   r   Language._display_separator  ss    	>
 +'''&t-D-D-FG1:###  	.SZZ@	s   A (A9c                 @    U R                  5       nUR                  X!S9$ )u  
Give the display name of this language *in* this language.
Requires that `language_data` is installed.

>>> Language.get('fr').autonym()
'français'
>>> Language.get('es').autonym()
'español'
>>> Language.get('ja').autonym()
'日本語'

This uses the `display_name()` method, so it can include the name of a
script or territory when appropriate.

>>> Language.get('en-AU').autonym()
'English (Australia)'
>>> Language.get('sr-Latn').autonym()
'srpski (latinica)'
>>> Language.get('sr-Cyrl').autonym()
'српски (ћирилица)'
>>> Language.get('pa').autonym()
'ਪੰਜਾਬੀ'
>>> Language.get('pa-Arab').autonym()
'پنجابی (عربی)'

This only works for language codes that CLDR has locale data for. You
can't ask for the autonym of 'ja-Latn' and get 'nihongo (rōmaji)'.
)r   r   )rm   r   )r4   r   langs      r5   autonymLanguage.autonym  s&    : ((*  $ JJr8   c                 &    U R                  SX5      $ )zq
Describe the script part of the language tag in a natural language.
Requires that `language_data` is installed.
r!   r   r   s      r5   r   Language.script_name6  s     ~~h??r8   c                 &    U R                  SX5      $ )zt
Describe the territory part of the language tag in a natural language.
Requires that `language_data` is installed.
r"   r   r   s      r5   r   Language.territory_nameA  s     ~~k8BBr8   c                 Z    [         R                  " S[        5        U R                  X5      $ )NzB`region_name` has been renamed to `territory_name` for consistency)r   r   r   r   r   s      r5   region_nameLanguage.region_nameL  s)    
 	P	
 ""8::r8   c                 P    [         R                  " S[        5        U R                  $ )NzEThe `region` property has been renamed to `territory` for consistency)r   r   r   r"   ri   s    r5   regionLanguage.regionW  s     S	
 ~~r8   c                 b    [         R                  " S[        5        U R                  =(       d    / $ )z
Deprecated in version 3.0.

We don't store names for variants anymore, so this just returns the list
of variant codes, such as ['oxendict'] for en-GB-oxendict.
z>variant_names is deprecated and just returns the variant codes)r   r   r   r#   r   s      r5   variant_namesLanguage.variant_names_  s'     	L	
 }}""r8   c                     0 nU R                   (       a  U R                  X5      US'   U R                  (       a  U R                  X5      US'   U R                  (       a  U R                  X5      US'   U$ )u  
Return a dictionary that describes a given language tag in a specified
natural language. Requires that `language_data` is installed.

See `language_name` and related methods for more specific versions of this.

The desired `language` will in fact be matched against the available
options using the matching technique that this module provides. We can
illustrate many aspects of this by asking for a description of Shavian
script (a phonetic script for English devised by author George Bernard
Shaw), and where you might find it, in various languages.

>>> shaw = Language.make(script='Shaw').maximize()
>>> shaw.describe('en')
{'language': 'English', 'script': 'Shavian', 'territory': 'United Kingdom'}

>>> shaw.describe('fr')
{'language': 'anglais', 'script': 'shavien', 'territory': 'Royaume-Uni'}

>>> shaw.describe('es')
{'language': 'inglés', 'script': 'shaviano', 'territory': 'Reino Unido'}

>>> shaw.describe('pt')
{'language': 'inglês', 'script': 'shaviano', 'territory': 'Reino Unido'}

>>> shaw.describe('uk')
{'language': 'англійська', 'script': 'шоу', 'territory': 'Велика Британія'}

>>> shaw.describe('arb')
{'language': 'الإنجليزية', 'script': 'الشواني', 'territory': 'المملكة المتحدة'}

>>> shaw.describe('th')
{'language': 'อังกฤษ', 'script': 'ซอเวียน', 'territory': 'สหราชอาณาจักร'}

>>> shaw.describe('zh-Hans')
{'language': '英语', 'script': '萧伯纳式文', 'territory': '英国'}

>>> shaw.describe('zh-Hant')
{'language': '英文', 'script': '簫柏納字符', 'territory': '英國'}

>>> shaw.describe('ja')
{'language': '英語', 'script': 'ショー文字', 'territory': 'イギリス'}

When we don't have a localization for the language, we fall back on English,
because the IANA provides names for all known codes in English.

>>> shaw.describe('lol')
{'language': 'English', 'script': 'Shavian', 'territory': 'United Kingdom'}

When the language tag itself is a valid tag but with no known meaning, we
say so in the appropriate language.

>>> Language.get('xyz-ZY').display_name()
'Unknown language [xyz] (Unknown Region [ZY])'

>>> Language.get('xyz-ZY').display_name('es')
'lengua desconocida [xyz] (Región desconocida [ZY])'
r   r!   r"   )r   r   r!   r   r"   r   )r4   r   r   r   s       r5   describeLanguage.describep  sb    ~ == $ 2 28 JE*;;"..xFE(O>>!%!4!4X!LE+r8   c                      SSK Jn  U R                  SS/5      nUR                  [        U5      S5      $ ! [         a    [        [        [
        R                  S9  e f = f)a  
Get an estimate of how many people in the world speak this language,
derived from CLDR data. Requires that `language_data` is installed.

Only the language and territory codes will be considered. If a
territory code is included, the population will count only the
speakers of the language in that territory.

Script subtags are disregarded, because it doesn't make sense to ask
how many people speak in a particular writing script.

>>> Language.get('es').speaking_population()
493528077
>>> Language.get('pt').speaking_population()
237496885
>>> Language.get('es-BR').speaking_population()
76218
>>> Language.get('pt-BR').speaking_population()
192661560
>>> Language.get('vo').speaking_population()
0
r   )LANGUAGE_SPEAKING_POPULATIONr   r   r"   )
language_data.population_datar  r   r   r   r   r   r|   rL   r   )r4   r  r   s      r5   speaking_populationLanguage.speaking_population  s[    .	R
 &&
K'@A+//D	1==  	.SZZ@	s	   6 (Ac                     SSK Jn  U R                  / SQ5      n[        U5      U;   a  U[        U5         $ UR                  5       nUR                  [        U5      S5      $ ! [         a    [        [        [
        R                  S9  e f = f)a  
Get an estimate of how many people in the world read and write
this language, derived from CLDR data. Requires that `language_data`
is installed.

For many languages that aren't typically written, this is an
overestimate, according to CLDR -- the data often includes people who
speak that language but write in a different language.

Only the language, script, and territory codes will be considered.
If a territory code is included, the population will count only the
speakers of the language in that territory.

>>> all = Language.get('zh').writing_population()
>>> all
1240841517

>>> traditional = Language.get('zh-Hant').writing_population()
>>> traditional
36863340

>>> simplified = Language.get('zh-Hans').writing_population()
>>> all == traditional + simplified
True

>>> Language.get('zh-Hant-HK').writing_population()
6439733
>>> Language.get('zh-Hans-HK').writing_population()
338933

Note that if you want to get the total Chinese writing population
of Hong Kong, you need to avoid normalization that would interpret
'zh-HK' as 'zh-Hant-HK'.

>>> Language.get('zh-HK', normalize=False).writing_population()
6778666

Unknown or unspecified language codes get a population of 0.

>>> Language.get('xyz').writing_population()
0

>>> Language.get('und').writing_population()
0
r   )LANGUAGE_WRITING_POPULATIONr   )r   r!   r"   )r  r  r   r   r   r   r   r|   r   re   rL   )r4   r  r   s      r5   writing_populationLanguage.writing_population  s    \	Q
 &&'JKt933.s4y99'')D.223t9a@@  	.SZZ@	s   A# #(Btagtypenamec                     SSK Jn  [        U[        5      (       a  UR                  nO*[        U[        5      (       a  [        U5      R                  nUc  SnU" XU5      nUc  [        SU  SU< 35      eSU;   a  [        R                  U5      $ X0n[        R                  " S0 UD6$ ! [         a    [        [        [
        R                  S9  e f = f)	u	  
Find the subtag of a particular `tagtype` that has the given `name`.
Requires that `language_data` is installed.

The default language, "und", will allow matching names in any language,
so you can get the code 'fr' by looking up "French", "Français", or
"francés".

Occasionally, names are ambiguous in a way that can be resolved by
specifying what name the language is supposed to be in. For example,
there is a language named 'Malayo' in English, but it's different from
the language named 'Malayo' in Spanish (which is Malay). Specifying the
language will look up the name in a trie that is only in that language.

In a previous version, we thought we were going to deprecate the
`language` parameter, as there weren't significant cases of conflicts
in names of things between languages. Well, we got more data, and
conflicts in names are everywhere.

Specifying the language that the name should be in is still not
required, but it will help to make sure that names can be
round-tripped.

>>> Language.find_name('language', 'francés')
Language.make(language='fr')

>>> Language.find_name('territory', 'United Kingdom')
Language.make(territory='GB')

>>> Language.find_name('script', 'Arabic')
Language.make(script='Arab')

>>> Language.find_name('language', 'norsk bokmål')
Language.make(language='nb')

>>> Language.find_name('language', 'norsk')
Language.make(language='no')

>>> Language.find_name('language', 'norsk', 'en')
Traceback (most recent call last):
    ...
LookupError: Can't find any language named 'norsk'

>>> Language.find_name('language', 'norsk', 'no')
Language.make(language='no')

>>> Language.find_name('language', 'malayo', 'en')
Language.make(language='mbp')

>>> Language.find_name('language', 'malayo', 'es')
Language.make(language='ms')

Some langauge names resolve to more than a language. For example,
the name 'Brazilian Portuguese' resolves to a language and a territory,
and 'Simplified Chinese' resolves to a language and a script. In these
cases, a Language object with multiple subtags will be returned.

>>> Language.find_name('language', 'Brazilian Portuguese', 'en')
Language.make(language='pt', territory='BR')

>>> Language.find_name('language', 'Simplified Chinese', 'en')
Language.make(language='zh', script='Hans')

A small amount of fuzzy matching is supported: if the name can be
shortened to match a single language name, you get that language.
This allows, for example, "Hakka dialect" to match "Hakka".

>>> Language.find_name('language', 'Hakka dialect')
Language.make(language='hak')
r   )name_to_coder   rI   zCan't find any z named rE   r;   )r   r  r   r   r   r   r   rK   r   r   r   rL   rv   r@   )r	  r
  r   r  coderS   s         r5   	find_nameLanguage.find_name  s    T	8 h))((H#&&8}--HHG84<yxHII$;<<%%?D==(4(()  	.SZZ@	s   B+ +(Cc                 .    [         R                  SX5      $ )u\  
A concise version of `find_name`, used to get a language tag by its
name in a natural language. The language can be omitted in the large
majority of cases, where the language name is not ambiguous.

>>> Language.find('Türkçe')
Language.make(language='tr')
>>> Language.find('brazilian portuguese')
Language.make(language='pt', territory='BR')
>>> Language.find('simplified chinese')
Language.make(language='zh', script='Hans')

Some language names are ambiguous: for example, there is a language
named 'Fala' in English (with code 'fax'), but 'Fala' is also the
Kwasio word for French. In this case, specifying the language that
the name is in is necessary for disambiguation.

>>> Language.find('fala')
Language.make(language='fr')
>>> Language.find('fala', 'nmg')
Language.make(language='fr')
>>> Language.find('fala', 'en')
Language.make(language='fax')
r   )r   r  )r
  r   s     r5   findLanguage.findu  s    8 !!*d==r8   c                     U R                   b  U R                   $ 0 nU R                   H  n[        X5      nU(       d  M  X1U'   M     Xl         U$ )zt
Get a dictionary of the attributes of this Language object, which
can be useful for constructing a similar object.
)r0   r   r   )r4   r[   r   rW   s       r5   rN   Language.to_dict  sM    
 ::!::??CD&Eu#s # 
r8   c           
         [         R                  UR                  =(       d    U R                  UR                  =(       d    U R                  UR                  =(       d    U R                  UR
                  =(       d    U R
                  UR                  =(       d    U R                  UR                  =(       d    U R                  UR                  =(       d    U R                  S9$ )z;
Update this Language with the fields of another Language.
r   )	r   r@   r   r    r!   r"   r#   r$   r%   r4   others     r5   rM   Language.update  s     }}^^4t}}^^4t}}<<.4;;oo7^^4t}}'':4??MM1T\\  
 	
r8   newdatac                    [         R                  UR                  SU R                  5      UR                  SU R                  5      UR                  SU R
                  5      UR                  SU R                  5      UR                  SU R                  5      UR                  SU R                  5      UR                  SU R                  5      S9$ )	z;
Update the attributes of this Language from a dictionary.
r   r    r!   r"   r#   r$   r%   r   )
r   r@   rL   r   r    r!   r"   r#   r$   r%   )r4   r  s     r5   rd   Language.update_dict  s     }}[[T]];[[T]];;;x5kk+t~~>[[T]];{{<AKK	4<<8  
 	
r8   dr   c                 F    U Vs0 s H  o"U ;   d  M
  X U   _M     sn$ s  snf )z,
Select a subset of keys from a dictionary.
r;   )r  r   r   s      r5   _filter_keysLanguage._filter_keys  s(    
 (,8taxsVt888s   		r   c                 n    U R                  U R                  5       U5      n[        R                  " S0 UD6$ )zC
Return a copy of this object with a subset of its attributes set.
r;   )r  rN   r   r@   )r4   r   r   s      r5   r|   Language._filter_attributes  s.     $$T\\^V<}}(x((r8   c                     U R                   b  U R                   $ U R                  1 Sk5      R                  5       R                  5       U l         U R                   $ )z
Convert a parsed language tag so that the information it contains is in
the best form for looking up information in the CLDR.
>   r!   r   r"   )r*   r|   re   rm   ri   s    r5   _searchable_formLanguage._searchable_form  sT    
 '### ##$GH_!!# 	
 r8   c                 j    XL a  g[        U[        5      (       d  gU R                  UR                  :H  $ )NTF)rK   r   r/   r  s     r5   __eq__Language.__eq__  s-    =%**}}..r8   c                 ,    [        U R                  5      $ r   )hashr/   ri   s    r5   __hash__Language.__hash__  s    DMM""r8   r   c                 L    XR                   ;   a  [        X5      $ [        U5      er   )r   r   rh   r4   r   s     r5   __getitem__Language.__getitem__  s!    //!4%%3-r8   c                 B    XR                   ;   =(       a    [        X5      $ r   )r   r   r-  s     r5   __contains__Language.__contains__  s    oo%<'$*<<r8   c                     / nU R                    H7  n[        X5      (       d  M  [        X5      nUR                  U SU< 35        M9     SR                  U5      nSU S3$ )N=z, zLanguage.make())r   r   rP   r_   )r4   itemsattrrW   joineds        r5   __repr__Language.__repr__  s`    OODt""+vQui01 $ 5!xq))r8   c                 "    U R                  5       $ r   )r3   ri   s    r5   __str__Language.__str__  s    {{}r8   )r,   r+   r0   r2   r1   r-   r.   r*   r)   r/   r$   r    r   r%   r!   r"   r#   )NNNNNNN)T)r9   r   )T)r9   zList[Language]F)	   r   )r  r   r9   r   )J__name__
__module____qualname____firstlineno____doc__r   r{   MATCHABLE_KEYSETSr&   r   r<   __annotations__r'   r   r   r   r
   r   r6   classmethodr@   staticmethodr   rL   r3   re   rj   rm   rw   r   r   r   r   fill_likely_valuesintr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   propertyr   r   dictr   r  r  r  r  rN   rM   rd   r	   r  r|   r#  r&  r*  r.  r1  r9  r<  __static_attributes__r;   r8   r5   r   r   0   s   *J 	,	[!	X		

O 	,	X	 +-JUJ&',79L$uS$Y'349 #',0 $#',0.2!%&3-& 8C=)& 	&
 C=& 8C=)& Xc]+& #&P  #',0 $#',0.2!%%3-% 8C=)% 	%
 C=% 8C=)% Xc]+% #% 
% %N dsJ' dJ d dL& &P 4'R$#LI Is IVd3i <B8
v "
8Z 
8C 
85G* 5GT 5Gc 5GnY$ Yvt <(4(4(-c:o(>(4NQ(4	(4T/S#X&/2</LO/$ ,<#BZ(#B #B 
	#BN ,<<!Z(<! <! 
	<!|"# " $C $(KC K KD ,<	@Z(	@ 	@ 
		@ ,<	CZ(	C 	C 
		C ,<	;Z(	; 	; 
		;   ,<#Z(# # 
#	#& ,<FZ(F F 
	FP>S >@9AC 9Av NR_)_)_)+3E#z/4J+K_)	_) _)B @D>>%eCO&<=>	> >:  

4 
J 
 9 9HSM 9d 9 9)# ): ) /## # s  xc49n0E'F  = = =*# * r8   r   rB   macror9   c                     [         R                  U SS9nU(       a  UR                  5       nUR                  5       R	                  5       $ )aE  
Standardize a language tag:

- Replace deprecated values with their updated versions (if those exist)
- Remove script tags that are redundant with the language
- If *macro* is True, use a macrolanguage to represent the most common
  standardized language within that macrolanguage. For example, 'cmn'
  (Mandarin) becomes 'zh' (Chinese), and 'arb' (Modern Standard Arabic)
  becomes 'ar' (Arabic).
- Format the result according to the conventions of BCP 47

Macrolanguage replacement is not required by BCP 47, but it is required
by the Unicode CLDR.

>>> standardize_tag('en_US')
'en-US'

>>> standardize_tag('en-Latn')
'en'

>>> standardize_tag('en-uk')
'en-GB'

>>> standardize_tag('eng')
'en'

>>> standardize_tag('arb-Arab', macro=True)
'ar'

>>> standardize_tag('sh-QU')
'sr-Latn-EU'

>>> standardize_tag('sgn-US')
'ase'

>>> standardize_tag('zh-cmn-hans-cn')
'zh-Hans-CN'

>>> standardize_tag('zsm', macro=True)
'ms'

>>> standardize_tag('ja-latn-hepburn')
'ja-Latn-hepburn'

>>> standardize_tag('spa-latn-mx')
'es-MX'

If the tag can't be parsed according to BCP 47, this will raise a
LanguageTagError (a subclass of ValueError):

>>> standardize_tag('spa-mx-latn')
Traceback (most recent call last):
    ...
langcodes.tag_parser.LanguageTagError: This script subtag, 'latn', is out of place. Expected variant, extension, or end of string.
Tr   )r   rL   rm   re   r3   )rB   rO  langdatas      r5   standardize_tagrR    s@    p ||C4|0H002##%,,..r8   c                 n     [         R                  U 5      nUR                  5       $ ! [         a     gf = f)ag  
Determines whether a string is a valid language tag. This is similar to
Language.get(tag).is_valid(), but can return False in the case where
the tag doesn't parse.

>>> tag_is_valid('ja')
True
>>> tag_is_valid('jp')
False
>>> tag_is_valid('spa-Latn-MX')
True
>>> tag_is_valid('spa-MX-Latn')
False
>>> tag_is_valid('')
False
>>> tag_is_valid('C.UTF-8')
False
F)r   rL   r   r   )rB   rQ  s     r5   tag_is_validrT  D  s6    &<<$  "" s   $' 
44desiredr   c                     [         R                  " S[        5        [        R	                  U 5      n[        R	                  U5      nUR                  U5      $ )a  
DEPRECATED: use .distance() instead, which uses newer data and is _lower_
for better matching languages.

Return a number from 0 to 100 indicating the strength of match between the
language the user desires, D, and a supported language, S. Higher numbers
are better. A reasonable cutoff for not messing with your users is to
only accept scores of 75 or more.

A score of 100 means the languages are the same, possibly after normalizing
and filling in likely values.
ztag_match_score is deprecated because it's based on deprecated CLDR info. Use tag_distance instead, which is _lower_ for better matching languages. )r   r   r   r   rL   r   )rU  r   
desired_ldsupported_lds       r5   tag_match_scorerY  ^  sH     MM	U
 g&J<<	*L!!,//r8   r   c                 x    [         R                  U 5      n[         R                  U5      nUR                  XB5      $ )u  
Tags that expand to the same thing when likely values are filled in get a
distance of 0.

>>> tag_distance('en', 'en')
0
>>> tag_distance('en', 'en-US')
0
>>> tag_distance('zh-Hant', 'zh-TW')
0
>>> tag_distance('ru-Cyrl', 'ru')
0

As a specific example, Serbo-Croatian is a politically contentious idea,
but in CLDR, it's considered equivalent to Serbian in Latin characters.

>>> tag_distance('sh', 'sr-Latn')
0

... which is very similar to Croatian but sociopolitically not the same.

>>> tag_distance('sh', 'hr')
9

Unicode reorganized its distinction between 'no' (Norwegian) and 'nb'
(Norwegian Bokmål) in 2021. 'no' is preferred in most contexts, and the more
specific 'nb' is a distance of 1 from it:

>>> tag_distance('nb', 'no')
1

These distances can be asymmetrical: this data includes the fact that speakers
of Swiss German (gsw) know High German (de), but not at all the other way around.

The difference seems a little bit extreme, but the asymmetry is certainly
there. And if your text is tagged as 'gsw', it must be that way for a
reason.

>>> tag_distance('gsw', 'de')
8
>>> tag_distance('de', 'gsw')
84

Unconnected languages get a distance of 80 to 134.

>>> tag_distance('en', 'zh')
134
>>> tag_distance('es', 'fr')
84
>>> tag_distance('fr-CH', 'de-CH')
80

Different local variants of the same language get a distance from 3 to 5.
>>> tag_distance('zh-HK', 'zh-MO')   # Chinese is similar in Hong Kong and Macao
4
>>> tag_distance('en-AU', 'en-GB')   # Australian English is similar to British English
3
>>> tag_distance('en-IN', 'en-GB')   # Indian English is also similar to British English
3
>>> tag_distance('es-PE', 'es-419')  # Peruvian Spanish is Latin American Spanish
1
>>> tag_distance('es-419', 'es-PE')  # but Latin American Spanish is not necessarily Peruvian
4
>>> tag_distance('es-ES', 'es-419')  # Spanish in Spain is further from Latin American Spanish
5
>>> tag_distance('en-US', 'en-GB')   # American and British English are somewhat different
5
>>> tag_distance('es-MX', 'es-ES')   # Mexican Spanish is different from Spanish Spanish
5
>>> # European Portuguese is different from the most common form (Brazilian Portuguese)
>>> tag_distance('pt', 'pt-PT')
5

>>> # Serbian has two scripts, and people might prefer one but understand both
>>> tag_distance('sr-Latn', 'sr-Cyrl')
5

A distance of 10 is used for matching a specific language to its
more-commonly-used macrolanguage tag.

>>> tag_distance('arz', 'ar')  # Egyptian Arabic to Modern Standard Arabic
10
>>> tag_distance('wuu', 'zh')  # Wu Chinese to (Mandarin) Chinese
10

Higher distances can arrive due to particularly contentious differences in
the script for writing the language, where people who understand one script
can learn the other but may not be happy with it. This specifically applies
to Chinese.

>>> tag_distance('zh-TW', 'zh-CN')
54
>>> tag_distance('zh-Hans', 'zh-Hant')
54
>>> tag_distance('zh-CN', 'zh-HK')
54
>>> tag_distance('zh-CN', 'zh-TW')
54
>>> tag_distance('zh-Hant', 'zh-Hans')
54

This distance range also applies to the differences between Norwegian
Bokmål, Nynorsk, and Danish.

>>> tag_distance('no', 'da')
12
>>> tag_distance('no', 'nn')
20

Differences of 20 to 50 can represent substantially different languages,
in cases where speakers of the first may understand the second for demographic
reasons.

>>> tag_distance('eu', 'es')  # Basque to Spanish
20
>>> tag_distance('af', 'nl')  # Afrikaans to Dutch
24
>>> tag_distance('mr', 'hi')  # Marathi to Hindi
30
>>> tag_distance('ms', 'id')  # Malay to Indonesian
34
>>> tag_distance('mg', 'fr')  # Malagasy to French
34
>>> tag_distance('ta', 'en')  # Tamil to English
44

A complex example is the tag 'yue' for Cantonese. Written Chinese is usually
presumed to be Mandarin Chinese, but colloquial Cantonese can be written as
well. (Some things could not be written any other way, such as Cantonese
song lyrics.)

The difference between Cantonese and Mandarin also implies script and
territory differences by default, adding to the distance.

>>> tag_distance('yue', 'zh')
64

When the supported script is a different one than desired, this is usually
a major difference with score of 50 or more.

>>> tag_distance('ja', 'ja-Latn-US-hepburn')
54

If `ignore_script` is used, the script difference is ignored and a smaller
difference with lower score will be found.

>>> tag_distance('ja', 'ja-Latn-hepburn', ignore_script=True)
0

>>> # You can read the Shavian script, right?
>>> tag_distance('en', 'en-Shaw')
54
)r   rL   r   )rU  r   r   desired_objsupported_objs        r5   tag_distancer]  w  s2    t ,,w'KLL+M==r8   desired_languagesupported_languages	min_scorec                 L    SU-
  n[        XU5      u  pE[        SSU-
  5      nXF4$ )aw  
DEPRECATED: use .closest_match() instead. This function emulates the old
matching behavior by subtracting the language distance from 100.

You have software that supports any of the `supported_languages`. You want
to use `desired_language`. This function lets you choose the right language,
even if there isn't an exact match.

Returns:

- The best-matching language code, which will be one of the
  `supported_languages` or 'und'
- The score of the match, from 0 to 100; higher is better.

`min_score` sets the minimum match score. If all languages match with a lower
score than that, the result will be 'und' with a score of 0.
r   r   )r   max)r^  r_  r`  r   r   r   r   s          r5   
best_matchrc    s:    , ?L'|I 3>"Er8   r   c           	      $   [        U 5      n X;   a  U S4$ [        U 5      n X;   a  U S4$ U Vs/ s H  nU[        XU5      4PM     nnU VVs/ s H  u  pFXb::  d  M  XF4PM     snnS/-   nUR                  [	        S5      S9  US   $ s  snf s  snnf )aT  
You have software that supports any of the `supported_languages`. You want
to use `desired_language`. This function lets you choose the right language,
even if there isn't an exact match.

Returns:

- The best-matching language code, which will be one of the
  `supported_languages` or 'und' for no match
- The distance of the match, which is 0 for a perfect match and increases
  from there (see `tag_distance`)

`max_distance` sets the maximum match distance. If all matches are farther
than that, the result will be 'und' with a distance of 1000. The default
value is 25, and raising it can cause data to be processed in significantly
the wrong language. The documentation for `tag_distance` describes the
distance values in more detail.

`ignore_script` makes the matching ignore scripts, allowing matches to be
found when they wouldn't otherwise be due to different scripts.

When there is a tie for the best matching language, the first one in the
tie will be used.

>>> closest_match('fr', ['de', 'en', 'fr'])
('fr', 0)

>>> closest_match('pt', ['pt-BR', 'pt-PT'])
('pt-BR', 0)

>>> closest_match('en-AU', ['en-GB', 'en-US'])
('en-GB', 3)

>>> closest_match('af', ['en', 'nl', 'zu'])
('nl', 24)

>>> closest_match('ja', ['ja-Latn-hepburn', 'en'])
('und', 1000)

>>> closest_match('ja', ['ja-Latn-hepburn', 'en'], ignore_script=True)
('ja-Latn-hepburn', 0)
r   )rI     r   )r   )r   rR  r]  sortr   )r^  r_  r   r   r   match_distancesr   s          r5   r   r   4  s    ` +, ."" ''78."" -,I 
L!1mLM,   &5%4!Y# 	%4 
	O Z]+1s   BBBc                 0    [        XU5      u  p4US:X  a  gU$ )a  
Wraps `closest_match` with a simpler return type. Returns the language
tag of the closest match if there is one, or None if there is not.

>>> closest_supported_match('fr', ['de', 'en', 'fr'])
'fr'

>>> closest_supported_match('pt', ['pt-BR', 'pt-PT'])
'pt-BR'

>>> closest_supported_match('en-AU', ['en-GB', 'en-US'])
'en-GB'

>>> closest_supported_match('und', ['en', 'und'])
'und'

>>> closest_supported_match('af', ['en', 'nl', 'zu'])
'nl'

>>> print(closest_supported_match('af', ['en', 'nl', 'zu'], max_distance=10))
None
re  N)r   )r^  r_  r   r  r   s        r5   closest_supported_matchri  }  s#    6 ##3,WND4r8   r?  )K   )r   F)r   )0rE  operatorr   typingr   r   r   r   r   r	   r
   r   r   r   r   langcodes.tag_parserr   r   r   langcodes.language_distancer   langcodes.data_dictsr   r   r   r   r   r   r   r   r   r   r   r   rL   r  r  LanguageDatar   r   rR  rT  rK  rY  r]  rc  r   ri  r;   r8   r5   <module>rq     s     W W W  
 R R =
 
 
    I IZ. ll}}	 </sH}- </d </s </~eCM* t 403=!0.3CM.B002\>%X. \>5h;O \>`d \>qt \>D CM*!#  38_	B 	FCM*F!#F F 	F
 38_FX CM*!#  c]	r8   