Source: lib/util/language_utils.js

  1. /*! @license
  2. * Shaka Player
  3. * Copyright 2016 Google LLC
  4. * SPDX-License-Identifier: Apache-2.0
  5. */
  6. goog.provide('shaka.util.LanguageUtils');
  7. goog.require('goog.asserts');
  8. goog.require('shaka.util.ManifestParserUtils');
  9. /**
  10. * @summary A set of language utility functions.
  11. * @final
  12. * @export
  13. */
  14. shaka.util.LanguageUtils = class {
  15. /**
  16. * Check if |locale1| and |locale2| are locale-compatible.
  17. *
  18. * Locale-compatible is defined as all components in each locale match. Since
  19. * we only respect the language and region components, we only check that
  20. * the language and region components match.
  21. *
  22. * Examples:
  23. * Locale A | Locale B | Locale Compatible
  24. * ---------------------------------------
  25. * en-US | en-US | true
  26. * en | en-US | false
  27. * en-US | en-CA | false
  28. *
  29. * @param {string} locale1
  30. * @param {string} locale2
  31. * @return {boolean}
  32. * @export
  33. */
  34. static areLocaleCompatible(locale1, locale2) {
  35. const LanguageUtils = shaka.util.LanguageUtils;
  36. // Even through they SHOULD already be normalized, let's just be safe and
  37. // do it again.
  38. locale1 = LanguageUtils.normalize(locale1);
  39. locale2 = LanguageUtils.normalize(locale2);
  40. return locale1 == locale2;
  41. }
  42. /**
  43. * Check if |locale1| and |locale2| are language-compatible.
  44. *
  45. * Language compatible is when the language component of each locale matches.
  46. * This means that no matter what region they have (or don't have) as long as
  47. * the language components match, they are language-compatible.
  48. *
  49. * Examples:
  50. * Locale A | Locale B | Language-Compatible
  51. * -----------------------------------------
  52. * en-US | en-US | true
  53. * en-US | en | true
  54. * en-US | en-CA | true
  55. * en-CA | fr-CA | false
  56. *
  57. * @param {string} locale1
  58. * @param {string} locale2
  59. * @return {boolean}
  60. * @export
  61. */
  62. static areLanguageCompatible(locale1, locale2) {
  63. const LanguageUtils = shaka.util.LanguageUtils;
  64. // Even through they SHOULD already be normalized, let's just be safe and
  65. // do it again.
  66. locale1 = LanguageUtils.normalize(locale1);
  67. locale2 = LanguageUtils.normalize(locale2);
  68. // Get all components. This should only be language and region
  69. // since we do not support dialect.
  70. /** @type {!Array<string>} */
  71. const locale1Components = LanguageUtils.disassembleLocale_(locale1);
  72. /** @type {!Array<string>} */
  73. const locale2Components = LanguageUtils.disassembleLocale_(locale2);
  74. // We are language compatible if we have the same language.
  75. return locale1Components[0] == locale2Components[0];
  76. }
  77. /**
  78. * Check if |possibleParent| is the parent locale of |possibleChild|. Because
  79. * we do not support dialects, the parent-child relationship is a lot simpler.
  80. * In a parent child relationship:
  81. * - The parent and child have the same language-component
  82. * - The parent has no region-component
  83. * - The child has a region-component
  84. *
  85. * Example:
  86. * Locale A | Locale B | Is A The parent of B?
  87. * --------------------------------------------
  88. * en-US | en-US | no
  89. * en-US | en | no
  90. * en | en-US | yes
  91. * en | en | no
  92. * en | fr | no
  93. *
  94. * @param {string} possibleParent
  95. * @param {string} possibleChild
  96. * @return {boolean}
  97. * @export
  98. */
  99. static isParentOf(possibleParent, possibleChild) {
  100. const LanguageUtils = shaka.util.LanguageUtils;
  101. // Even through they SHOULD already be normalized, let's just be safe and
  102. // do it again.
  103. possibleParent = LanguageUtils.normalize(possibleParent);
  104. possibleChild = LanguageUtils.normalize(possibleChild);
  105. // Get all components. This should only be language and region
  106. // since we do not support dialect.
  107. /** @type {!Array<string>} */
  108. const possibleParentComponents =
  109. LanguageUtils.disassembleLocale_(possibleParent);
  110. /** @type {!Array<string>} */
  111. const possibleChildComponents =
  112. LanguageUtils.disassembleLocale_(possibleChild);
  113. return possibleParentComponents[0] == possibleChildComponents[0] &&
  114. possibleParentComponents.length == 1 &&
  115. possibleChildComponents.length == 2;
  116. }
  117. /**
  118. * Check if |localeA| shares the same parent with |localeB|. Since we don't
  119. * support dialect, we will only look at language and region. For two locales
  120. * to be siblings:
  121. * - Both must have language-components
  122. * - Both must have region-components
  123. * - Both must have the same language-component
  124. *
  125. * Example:
  126. * Locale A | Locale B | Siblings?
  127. * --------------------------------------------
  128. * en-US | en-US | yes
  129. * en-US | en-CA | yes
  130. * en-US | en | no
  131. * en | en-US | no
  132. * en | en | no
  133. * en | fr | no
  134. *
  135. * @param {string} localeA
  136. * @param {string} localeB
  137. * @return {boolean}
  138. * @export
  139. */
  140. static isSiblingOf(localeA, localeB) {
  141. const LanguageUtils = shaka.util.LanguageUtils;
  142. // Even through they SHOULD already be normalized, let's just be safe and
  143. // do it again.
  144. localeA = LanguageUtils.normalize(localeA);
  145. localeB = LanguageUtils.normalize(localeB);
  146. // Get all components. This should only be language and region
  147. // since we do not support dialect.
  148. /** @type {!Array<string>} */
  149. const localeAComponents = LanguageUtils.disassembleLocale_(localeA);
  150. /** @type {!Array<string>} */
  151. const localeBComponents = LanguageUtils.disassembleLocale_(localeB);
  152. return localeAComponents.length == 2 &&
  153. localeBComponents.length == 2 &&
  154. localeAComponents[0] == localeBComponents[0];
  155. }
  156. /**
  157. * Normalize a locale. This will take a locale and canonicalize it to a state
  158. * that we are prepared to work with.
  159. *
  160. * We only support with:
  161. * - language
  162. * - language-REGION
  163. *
  164. * If given a dialect, we will discard it. We will convert any 3-character
  165. * codes to 2-character codes. We will force language codes to lowercase and
  166. * region codes to uppercase.
  167. *
  168. * @param {string} locale
  169. * @return {string}
  170. * @export
  171. */
  172. static normalize(locale) {
  173. const LanguageUtils = shaka.util.LanguageUtils;
  174. const privateUsePrefix = 'x-';
  175. const [languageRegion = '', privateUseSuffix = ''] =
  176. locale.split(`-${privateUsePrefix}`);
  177. const [languageCode = '', regionCode = ''] = languageRegion.split('-');
  178. // We are only going to use the language, the region and the private use part (as per https://datatracker.ietf.org/doc/html/rfc5646).
  179. // Anything else is thrown away.
  180. const privateUse = privateUseSuffix ?
  181. `${privateUsePrefix}${privateUseSuffix}` : '';
  182. // Convert the language to lower case. It is standard for the language code
  183. // to be in lower case, but it will also make the map look-up easier.
  184. let language = languageCode.toLowerCase();
  185. language = LanguageUtils.isoMap_.get(language) || language;
  186. // Convert the region to upper case. It is standard for the region to be in
  187. // upper case. If there is no upper code, then it will be an empty string
  188. // and this will be a no-op.
  189. const region = regionCode.toUpperCase();
  190. return `${region ? `${language}-${region}` : language}${
  191. privateUse ? `-${privateUse}` : ''}`;
  192. }
  193. /**
  194. * Check if two language codes are siblings. Language codes are siblings if
  195. * they share the same base language while neither one is the base language.
  196. *
  197. * For example, "en-US" and "en-CA" are siblings but "en-US" and "en" are not
  198. * siblings.
  199. *
  200. * @param {string} a
  201. * @param {string} b
  202. * @return {boolean}
  203. * @export
  204. */
  205. static areSiblings(a, b) {
  206. const LanguageUtils = shaka.util.LanguageUtils;
  207. const baseA = LanguageUtils.getBase(a);
  208. const baseB = LanguageUtils.getBase(b);
  209. return a != baseA && b != baseB && baseA == baseB;
  210. }
  211. /**
  212. * Compute a numerical relatedness for language codes. Language codes with a
  213. * higher relatedness are a better match. Unrelated language codes have a
  214. * relatedness score of 0.
  215. *
  216. * @param {string} target
  217. * @param {string} candidate
  218. * @return {number}
  219. * @export
  220. */
  221. static relatedness(target, candidate) {
  222. const LanguageUtils = shaka.util.LanguageUtils;
  223. target = LanguageUtils.normalize(target);
  224. candidate = LanguageUtils.normalize(candidate);
  225. // An exact match is the top score.
  226. if (candidate == target) {
  227. return 4;
  228. }
  229. // Next is a parent of the target language.
  230. if (LanguageUtils.isParentOf(candidate, target)) {
  231. return 3;
  232. }
  233. // Next is a sibling of the target language.
  234. if (LanguageUtils.isSiblingOf(candidate, target)) {
  235. return 2;
  236. }
  237. // Next is a child of the target language.
  238. if (LanguageUtils.isParentOf(target, candidate)) {
  239. return 1;
  240. }
  241. // Otherwise, they are unrelated.
  242. return 0;
  243. }
  244. /**
  245. * Get the normalized base language for a language code.
  246. *
  247. * @param {string} lang
  248. * @return {string}
  249. * @export
  250. */
  251. static getBase(lang) {
  252. const LanguageUtils = shaka.util.LanguageUtils;
  253. const splitAt = lang.indexOf('-');
  254. let major;
  255. if (splitAt >= 0) {
  256. major = lang.substring(0, splitAt);
  257. } else {
  258. major = lang;
  259. }
  260. // Convert the major code to lower case. It is standard for the major code
  261. // to be in lower case, but it will also make the map look-up easier.
  262. major = major.toLowerCase();
  263. major = LanguageUtils.isoMap_.get(major) || major;
  264. return major;
  265. }
  266. /**
  267. * Get the normalized language of the given text stream. Will return 'und' if
  268. * a language is not found on the text stream.
  269. *
  270. * This should always be used to get the language from a text stream.
  271. *
  272. * @param {shaka.extern.Stream} stream
  273. * @return {string}
  274. * @export
  275. */
  276. static getLocaleForText(stream) {
  277. const LanguageUtils = shaka.util.LanguageUtils;
  278. const ContentType = shaka.util.ManifestParserUtils.ContentType;
  279. goog.asserts.assert(
  280. stream.type == ContentType.TEXT,
  281. 'Can only get language from text streams');
  282. const language = stream.language || 'und';
  283. return LanguageUtils.normalize(language);
  284. }
  285. /**
  286. * Get the normalized locale for the given variant. This will look through
  287. * the variant to find the locale that represents the content in the variant.
  288. * This will return 'und' if no language can be found.
  289. *
  290. * This should always be used to get the locale from a variant.
  291. *
  292. * @param {shaka.extern.Variant} variant
  293. * @return {string}
  294. * @export
  295. */
  296. static getLocaleForVariant(variant) {
  297. const LanguageUtils = shaka.util.LanguageUtils;
  298. // Our preference order is:
  299. // 1. Variant
  300. // 2. Audio Stream
  301. // 3. Video Stream
  302. //
  303. // We are going to consider all falsy strings to be invalid locales, this
  304. // will include empty strings.
  305. if (variant.language) {
  306. return LanguageUtils.normalize(variant.language);
  307. }
  308. if (variant.audio && variant.audio.language) {
  309. return LanguageUtils.normalize(variant.audio.language);
  310. }
  311. if (variant.video && variant.video.language) {
  312. return LanguageUtils.normalize(variant.video.language);
  313. }
  314. // No language was found, but we still want to return a valid string.
  315. return 'und';
  316. }
  317. /**
  318. * Find the locale in |searchSpace| that comes closest to |target|. If no
  319. * locale is found to be close to |target|, then |null| will be returned.
  320. *
  321. * @param {string} target
  322. * @param {!Iterable<string>} searchSpace
  323. * @return {?string}
  324. * @export
  325. */
  326. static findClosestLocale(target, searchSpace) {
  327. const LanguageUtils = shaka.util.LanguageUtils;
  328. /** @type {string} */
  329. const safeTarget = LanguageUtils.normalize(target);
  330. /** @type {!Set<string>} */
  331. const safeSearchSpace = new Set();
  332. for (const option of searchSpace) {
  333. safeSearchSpace.add(LanguageUtils.normalize(option));
  334. }
  335. // Preference 1 - The option is an exact match. For example, "en-US" is an
  336. // exact match of "en-US". So if there is an option that is an exact
  337. // match, it would be the best match possible.
  338. for (const option of safeSearchSpace) {
  339. if (option == safeTarget) {
  340. return option;
  341. }
  342. }
  343. // Preference 2 - The option is the parent of the target. For example,
  344. // "en" is the parent of "en-US". So if there is an option with
  345. // "en", it should be good enough when our preference is "en-US".
  346. for (const option of safeSearchSpace) {
  347. if (LanguageUtils.isParentOf(option, safeTarget)) {
  348. return option;
  349. }
  350. }
  351. // Preference 3 - The option is a sibling of the target. For example,
  352. // "en-US" is a sibling of "en-CA". So if there is an option with
  353. // "en_CA", it should be good enough when our preference is "en-US".
  354. for (const option of safeSearchSpace) {
  355. if (LanguageUtils.isSiblingOf(option, safeTarget)) {
  356. return option;
  357. }
  358. }
  359. // Preference 4 - The option is a child of the target. For example,
  360. // "en-US" is the child of "en". SO it there is an option with
  361. // "en-US", it should be good enough when our preference is "en".
  362. for (const option of safeSearchSpace) {
  363. if (LanguageUtils.isParentOf(safeTarget, option)) {
  364. return option;
  365. }
  366. }
  367. // Failed to find anything.
  368. return null;
  369. }
  370. /**
  371. * Take a locale string and break it into its component. Check that each
  372. * component matches what we would expect internally for locales. This
  373. * should ONLY be used to verify locales that have been normalized.
  374. *
  375. * @param {string} locale
  376. * @return {!Array<string>}
  377. * @private
  378. */
  379. static disassembleLocale_(locale) {
  380. const components = locale.split('-');
  381. goog.asserts.assert(
  382. components.length <= 2,
  383. [
  384. 'Locales should not have more than 2 components. ',
  385. locale,
  386. ' has too many components.',
  387. ].join());
  388. return components;
  389. }
  390. };
  391. /**
  392. * A map from 3-letter language codes (ISO 639-2) to 2-letter language codes
  393. * (ISO 639-1) for all languages which have both in the registry.
  394. *
  395. * @const {!Map<string, string>}
  396. * @private
  397. */
  398. shaka.util.LanguageUtils.isoMap_ = new Map([
  399. ['aar', 'aa'], ['abk', 'ab'], ['afr', 'af'], ['aka', 'ak'], ['alb', 'sq'],
  400. ['amh', 'am'], ['ara', 'ar'], ['arg', 'an'], ['arm', 'hy'], ['asm', 'as'],
  401. ['ava', 'av'], ['ave', 'ae'], ['aym', 'ay'], ['aze', 'az'], ['bak', 'ba'],
  402. ['bam', 'bm'], ['baq', 'eu'], ['bel', 'be'], ['ben', 'bn'], ['bih', 'bh'],
  403. ['bis', 'bi'], ['bod', 'bo'], ['bos', 'bs'], ['bre', 'br'], ['bul', 'bg'],
  404. ['bur', 'my'], ['cat', 'ca'], ['ces', 'cs'], ['cha', 'ch'], ['che', 'ce'],
  405. ['chi', 'zh'], ['chu', 'cu'], ['chv', 'cv'], ['cor', 'kw'], ['cos', 'co'],
  406. ['cre', 'cr'], ['cym', 'cy'], ['cze', 'cs'], ['dan', 'da'], ['deu', 'de'],
  407. ['div', 'dv'], ['dut', 'nl'], ['dzo', 'dz'], ['ell', 'el'], ['eng', 'en'],
  408. ['epo', 'eo'], ['est', 'et'], ['eus', 'eu'], ['ewe', 'ee'], ['fao', 'fo'],
  409. ['fas', 'fa'], ['fij', 'fj'], ['fin', 'fi'], ['fra', 'fr'], ['fre', 'fr'],
  410. ['fry', 'fy'], ['ful', 'ff'], ['geo', 'ka'], ['ger', 'de'], ['gla', 'gd'],
  411. ['gle', 'ga'], ['glg', 'gl'], ['glv', 'gv'], ['gre', 'el'], ['grn', 'gn'],
  412. ['guj', 'gu'], ['hat', 'ht'], ['hau', 'ha'], ['heb', 'he'], ['her', 'hz'],
  413. ['hin', 'hi'], ['hmo', 'ho'], ['hrv', 'hr'], ['hun', 'hu'], ['hye', 'hy'],
  414. ['ibo', 'ig'], ['ice', 'is'], ['ido', 'io'], ['iii', 'ii'], ['iku', 'iu'],
  415. ['ile', 'ie'], ['ina', 'ia'], ['ind', 'id'], ['ipk', 'ik'], ['isl', 'is'],
  416. ['ita', 'it'], ['jav', 'jv'], ['jpn', 'ja'], ['kal', 'kl'], ['kan', 'kn'],
  417. ['kas', 'ks'], ['kat', 'ka'], ['kau', 'kr'], ['kaz', 'kk'], ['khm', 'km'],
  418. ['kik', 'ki'], ['kin', 'rw'], ['kir', 'ky'], ['kom', 'kv'], ['kon', 'kg'],
  419. ['kor', 'ko'], ['kua', 'kj'], ['kur', 'ku'], ['lao', 'lo'], ['lat', 'la'],
  420. ['lav', 'lv'], ['lim', 'li'], ['lin', 'ln'], ['lit', 'lt'], ['ltz', 'lb'],
  421. ['lub', 'lu'], ['lug', 'lg'], ['mac', 'mk'], ['mah', 'mh'], ['mal', 'ml'],
  422. ['mao', 'mi'], ['mar', 'mr'], ['may', 'ms'], ['mkd', 'mk'], ['mlg', 'mg'],
  423. ['mlt', 'mt'], ['mon', 'mn'], ['mri', 'mi'], ['msa', 'ms'], ['mya', 'my'],
  424. ['nau', 'na'], ['nav', 'nv'], ['nbl', 'nr'], ['nde', 'nd'], ['ndo', 'ng'],
  425. ['nep', 'ne'], ['nld', 'nl'], ['nno', 'nn'], ['nob', 'nb'], ['nor', 'no'],
  426. ['nya', 'ny'], ['oci', 'oc'], ['oji', 'oj'], ['ori', 'or'], ['orm', 'om'],
  427. ['oss', 'os'], ['pan', 'pa'], ['per', 'fa'], ['pli', 'pi'], ['pol', 'pl'],
  428. ['por', 'pt'], ['pus', 'ps'], ['que', 'qu'], ['roh', 'rm'], ['ron', 'ro'],
  429. ['rum', 'ro'], ['run', 'rn'], ['rus', 'ru'], ['sag', 'sg'], ['san', 'sa'],
  430. ['sin', 'si'], ['slk', 'sk'], ['slo', 'sk'], ['slv', 'sl'], ['sme', 'se'],
  431. ['smo', 'sm'], ['sna', 'sn'], ['snd', 'sd'], ['som', 'so'], ['sot', 'st'],
  432. ['spa', 'es'], ['sqi', 'sq'], ['srd', 'sc'], ['srp', 'sr'], ['ssw', 'ss'],
  433. ['sun', 'su'], ['swa', 'sw'], ['swe', 'sv'], ['tah', 'ty'], ['tam', 'ta'],
  434. ['tat', 'tt'], ['tel', 'te'], ['tgk', 'tg'], ['tgl', 'tl'], ['tha', 'th'],
  435. ['tib', 'bo'], ['tir', 'ti'], ['ton', 'to'], ['tsn', 'tn'], ['tso', 'ts'],
  436. ['tuk', 'tk'], ['tur', 'tr'], ['twi', 'tw'], ['uig', 'ug'], ['ukr', 'uk'],
  437. ['urd', 'ur'], ['uzb', 'uz'], ['ven', 've'], ['vie', 'vi'], ['vol', 'vo'],
  438. ['wel', 'cy'], ['wln', 'wa'], ['wol', 'wo'], ['xho', 'xh'], ['yid', 'yi'],
  439. ['yor', 'yo'], ['zha', 'za'], ['zho', 'zh'], ['zul', 'zu'],
  440. ]);