add: config.py как единый источник правды (на базе html.entities)

This commit is contained in:
2025-08-03 20:00:59 +03:00
parent cf047a2552
commit 134f3807b2
4 changed files with 243 additions and 991 deletions

View File

@@ -14,97 +14,39 @@ STRINGS_FOR_DECODE = [
("­", "\u00AD"), # Мягкий перенос
("    ‍‌", "\u00A0\u2002\u2003\u2009\u200D\u200C"), # Набор пробелов и невидимых символов
("– — ‐ ―", ""), # Набор тире и дефисов
("' « »", "' « »"), # Апостроф и ёлочки
("“ ” „", "“ ” „"), # Двойные кавычки
("‘ ’ ‚", " "), # Одиночные кавычки
("‹ ›", " "), # Французские угловые кавычки
("$ ¢ £ ¤ ¥ € ₽", "$ ¢ £ ¤ ¥ € ₽"), # Валютные символы
("+ − × ÷ = ≠", "+ × ÷ = ≠"), # Математические символы
("± ¬ ° ¹ ² ³", "± ¬ ° ¹ ² ³"),
("ƒ % ‰ ‱", "ƒ % ‰ ‱"),
("∀ ∁ ∂ ∃ ∄", "∀ ∁ ∂ ∃ ∄"),
("∅ ∇ ∈ ∉ ∋ ∌", "∅ ∇ ∈ ∉ ∋ ∌"),
("∏ ∐ ∑ ∓ ∸", "∏ ∐ ∑ ∓ ∸"),
("∔ ∖ ∗ ∘ √", " ∘ √"),
("∝ ∞ ∠ ∟ ∡ ∣", "∝ ∞ ∠ ∟ ∡ "),
("∢ ∤ ∥ ∦ ∧ ∨", "∢ ∤ ∥ ∦ ∧ "),
("∩ ∪ ∫ ∬ ∭ ∮", " ∫ ∬ ∭ ∮"),
("∯ ∰ ∱ ∲", "∯ ∰ ∱ ∲"),
("∳ ∴ ∵ ∶ ∷", "∳ ∴ ∵ "),
("∺ ∻ ∼ ∽ ∾ ∿", "∺ ∻ ∽ ∾ ∿"),
("≀ ≁ ≂ ≃ ≄ ≅", "≀ ≁ ≂ ≃ ≄ ≅"),
("≈ ≆ ≇ ≉ ≊ ≋", "≈ ≆ ≇ ≉ ≊ ≋"),
("≌ ≍ ≎ ≏ ≐ ≑", "≌ ≍ ≎ ≏ ≐ ≑"),
("≒ ≓ ≔ ≕ ≖ ≗", "≒ ≓ ≔ ≕ ≖ ≗"),
("≙ ≚ ≜ ≟ ≡", "≙ ≚ ≜ ≟ ≡"),
("≢ ≤ ≥ ≦ ≧ ≨ ≩", "≢ ≤ ≥ ≦ ≧ ≨ ≩"),
("≪ ≫ ≬ ≭ ≮ ≯", "≪ ≫ ≬ ≭ ≮ ≯"),
("≰ ≱ ≲ ≳ ≴ ≵", "≰ ≱ ≲ ≳ ≴ ≵"),
("≶ ≷ ≸ ≹ ≺ ≻", "≶ ≷ ≸ ≹ ≺ ≻"),
("≼ ≽ ≾ ≿ ⊀ ⊁", "≼ ≽ ≾ ≿ ⊀ ⊁"),
("⊂ ⊃ ⊄ ⊅ ⊆ ⊇", "⊂ ⊃ ⊄ ⊅"),
("⊈ ⊉ ⊊ ⊋ ⊍", "⊈ ⊉ ⊊ ⊋ ⊍"),
("⊎ ⊏ ⊐ ⊑ ⊒", "⊎ ⊏ ⊐ ⊑ ⊒"),
("⊓ ⊔ ⊕ ⊖ ⊗", "⊓ ⊔ ⊕ ⊖ ⊗"),
("⊘ ⊙ ⊚ ⊛ ⊝ ⊞", "⊘ ⊙ ⊚ ⊛ ⊝ ⊞"),
("⊟ ⊠ ⊡ ⊢ ⊣ ⊤", "⊟ ⊠ ⊡ ⊢ ⊣ "),
("⊥ ⊧ ⊨ ⊩ ⊪", "⊥ ⊧ ⊨ ⊩ ⊪"),
("⊫ ⊬ ⊭ ⊮ ⊯", "⊫ ⊬ ⊭ ⊮ ⊯"),
("⊰ ⊲ ⊳ ⊴ ⊵", "⊰ ⊲ ⊳ ⊴ ⊵"),
("⊶ ⊷ ⊸ ⊹ ⊺", "⊶ ⊷ ⊸ ⊹ ⊺"),
("⊻ ⊽ ⊾ ⊿ ⋀", "⊻ ⊽ ⊾ ⊿ ⋀"),
("⋁ ⋂ ⋃ ⋄ ⋅ ⋆", " ⋄ ⋅ ⋆"),
("⋇ ⋈ ⋉ ⋊ ⋋", "⋇ ⋈ ⋉ ⋊ ⋋"),
("⋌ ⋍ ⋎ ⋏ ⋐ ⋑", "⋌ ⋍ ⋎ ⋏ ⋐ ⋑"),
("⋒ ⋓ ⋔ ⋕ ⋖ ⋗", "⋒ ⋓ ⋔ ⋕ ⋖ ⋗"),
("⋘ ⋙ ⋚ ⋛ ⋞ ⋟", "⋘ ⋙ ⋚ ⋛ ⋞ ⋟"),
("⋠ ⋡ ⋢ ⋣", "⋠ ⋡ ⋢ ⋣"),
("⪉ ⪊ ⋦ ⋧ ⋨", "⪉ ⪊ ⋦ ⋧ ⋨"),
("⋩ ⋪ ⋫ ⋬ ⋭", "⋩ ⋪ ⋫ ⋬ ⋭"),
("⋮ ⋯ ⋰ ⋱ ⋲", "⋮ ⋯ ⋰ ⋱ ⋲"),
("⋳ ⋴ ⋵ ⋶ ⋷", "⋳ ⋴ ⋵ ⋶ ⋷"),
("⋹ ⋺ ⋻ ⋼ ⋽", "⋹ ⋺ ⋻ ⋼ ⋽"),
("⋾ ⌅ ⌆ ⌈ ⌈", "⋾ ⌅ ⌆ ⌈ ⌈"),
("⌉ ⌊ ⌋ ⟨ ⟩", "⌉ ⌊ ⌋ ⟨ ⟩"),
# ("© ® ™ ℗ @", "© ® ™ ℗ @"), # Другие символы
# ("ℂ ℅ ℊ ℋ ℌ ℍ", " "),
# ("ℎ ℏ ℐ ℑ ℒ ℓ", " "),
# ("ℕ № ℘ ℙ ℚ ℛ", " № ℘ "),
# ("ℝ ℞ ℤ ℧ ℨ ℩", " ℩"),
# ("ℬ ℭ ℯ ℰ ℱ ℳ", " "),
# ("ℴ ℵ ℶ ℷ ℸ", " ℵ ℶ ℷ ℸ"),
# ("ⅅ ⅆ ⅇ ⅈ ffi ff", " ffi ff"),
# ("fi fl ★ ☆ ☎", "fi fl ★ ☆ ☎"),
# ("♀ ♂ ♠ ♣ ♥ ♦", "♀ ♂ ♠ ♣ ♥ ♦"),
# ("◊ ♪ ♭ ♮ ♯ ✓", "◊ ♪ ♭ ♮ ♯ ✓"),
# ("✗ ✠ ✶ ❘", "✗ ✠ ✶ ❘"),
# ("❲ ❳", " "),
# ("! # % ( ) *", "! # % ( ) *"), # Знаки препинания
# (", . / : ;", ", . / : ;"),
# ("? [ \ ] ^ _", "? [ \\ ] ^ _"),
# ("` { | } ˜", "` { | } ˜"),
# ("ˆ ‎ ‏ ¡ ¦ §", "ˆ \u200e \u200f ¡ ¦ §"),
# ("¨ ª ¬ ¯ ´ µ ‵", "¨ ª ¬ ¯ ´ µ "),
# ("¶ · ¸ º ¿ ‖", "¶ · ¸ º ¿ ‖"),
# ("† ‡ • ‥ …", "† ‡ • ‥ …"),
# ("‰ ‱ ′ ″ ‴", "‰ ‱ ″ ‴"),
# ("‾ ⁁ ⁃ ⁄ ⁏ ⁗", "‾ ⁏ ⁗"),
# ("½ ⅓ ¼ ⅕ ⅙", "½ ⅓ ¼ ⅕ ⅙"), # Дробные символы и знаки
# ("⅛ ⅔ ⅖ ¾ ⅗", "⅛ ⅔ ⅖ ¾ ⅗"),
# ("⅜ ⅘ ⅚ ⅝ ⅞", "⅜ ⅘ ⅚ ⅝ ⅞"),
# ("Α Β Γ Δ Ε Ζ", "Α Β Γ Δ Ε Ζ"), # Греческие символы
# ("Η Θ Ι Κ Λ Μ", "Η Θ Ι Κ Λ Μ"),
# ("Ν Ξ Ο Π Ρ Σ Τ", "Ν Ξ Ο Π Ρ Σ Τ"),
# ("Υ Φ Χ Ψ Ω α", "Υ Φ Χ Ψ Ω α"),
# ("β γ δ ε ζ η", "β γ δ ε ζ η"),
# ("θ ι κ λ μ ν", "θ ι κ λ μ ν"),
# ("ξ ο π ρ σ τ", "ξ ο π ρ σ τ"),
# ("υ φ χ ψ ω", "υ φ χ ψ ω"),
# ("ϑ ϒ ϖ", "ϑ ϒ ϖ"),
# Набор из html.entities.name2codepoint
("Æ Á Â À Α Å Ã Ä Ä", "Æ Á Â À Α Å Ã Ä Ä"),
("Β Ç Χ ‡ Δ Ð É Ê È", "Β Ç Χ ‡ Δ Ð É Ê È"),
("Ε Η Ë Γ Í Î Ì Ι Ï", "Ε Η Ë Γ Í Î Ì Ι Ï"),
("Κ Λ Μ Ñ Ν Œ Ó Ô Ò Ö", "Κ Λ Μ Ñ Ν Œ Ó Ô Ò Ö"),
("Ω Ο Ø Õ Φ Π ″ Ψ Ρ Š", "Ω Ο Ø Õ Φ Π ″ Ψ Ρ Š"),
("Σ Þ Τ Θ Ú Û Ù Υ Ü", "Σ Þ Τ Θ Ú Û Ù Υ Ü"),
("Ξ Ý Ÿ Ζ á â ´ æ à", "Ξ Ý Ÿ Ζ á â ´ æ à"),
("ℵ α & ∧ ∠ ' å ≈ ã ä", "α & ∧ ∠ ' å ≈ ã ä"),
("„ β ¦ • ∩ ç ¸ ¢ χ ˆ", "„ β ¦ • ∩ ç ¸ ¢ χ ˆ"),
("♣ ≅ © ↵ ∪ ¤ ⇓ † ↓ °", "♣ ≅ © ↵ ¤ ⇓ † ↓ °"),
("δ ♦ ÷ é ê è ∅    ", "δ ♦ ÷ é ê è ∅ \u2003 \u2002"),
("ε ≡ η ð ë € ∃ ƒ ∀ ½", "ε ≡ η ð ë € ∃ ƒ ∀ ½"),
("¼ ¾ ⁄ γ ≥ > ⇔ ↔ ♥ …", "¼ ¾ γ ≥ > ⇔ ↔ ♥ …"),
("í î ¡ ì ℑ ∞ ∫ ι ¿ ∈", "í î ¡ ì ∞ ∫ ι ¿ ∈"),
("ï κ ⇐ λ ⟨ « ← ⌈ “ ≤", "ï κ ⇐ λ ⟨ « ← ⌈ “ ≤"),
("&lfloor; &lowast; &loz; &lrm; &lsaquo; &lsquo; &lt; &macr; &mdash; &micro;", "\u200e < ¯ — µ"),
("&middot; &minus; &mu; &nabla; &nbsp; &ndash; &ne; &ni; &not; &notin;", "· μ ∇ \u00A0 ≠ ∋ ¬ ∉"),
("&nsub; &ntilde; &nu; &oacute; &ocirc; &oelig; &ograve; &oline; &omega;", "⊄ ñ ν ó ô œ ò ‾ ω"),
("&omicron; &oplus; &or; &ordf; &ordm; &oslash; &otilde; &otimes; &ouml;", "ο ª º ø õ ⊗ ö"),
("&para; &part; &permil; &perp; &phi; &pi; &piv; &plusmn; &pound; &prime; &prod;", "¶ ∂ ‰ ⊥ φ π ϖ ± £ "),
("&prop; &psi; &quot; &rArr; &radic; &rang; &raquo; &rarr; &rceil; &rdquo;", "∝ ψ \" ⇒ √ ⟩ » → ⌉ ”"),
("&real; &reg; &rfloor; &rho; &rlm; &rsaquo; &rsquo; &sbquo; &scaron;", " ® ⌋ ρ \u200f š"),
("&sdot; &sect; &shy; &sigma; &sigmaf; &sim; &spades; &sub; &sube; &sum;", "⋅ § \u00AD σ ς ♠ ⊂"),
("&sup; &sup1; &sup2; &sup3; &supe; &szlig; &tau; &there4; &theta; &thetasym;", "⊃ ¹ ² ³ ⊇ ß τ ∴ θ ϑ"),
("&thinsp; &thorn; &tilde; &times; &trade; &uArr; &uacute; &uarr; &ucirc;", "\u2009 þ ˜ × ™ ⇑ ú ↑ û"),
("&ugrave; &uml; &upsih; &upsilon; &uuml; &weierp; &xi; &yacute; &yen; &yuml;", "ù ¨ ϒ υ ü ℘ ξ ý ¥ ÿ"),
("&zeta; &zwj; &zwnj; &plus; &equals; &percnt;", "ζ \u200D \u200C + = %"),
# Набор из html.entities.html5
("", ""),
("", ""),
("", ""),
]
@pytest.mark.parametrize("input_string, expected_output", STRINGS_FOR_DECODE)
@@ -118,8 +60,49 @@ def test_html_mnemo_to_utf(input_string, expected_output):
assert actual_output == expected_output
@pytest.mark.parametrize("expected_output, input_string", STRINGS_FOR_DECODE)
def test_utf_to_html_mnemo(expected_output, input_string):
STRINGS_FOR_ENCODE = [
    # Test data for encoding UTF-8 text into HTML mnemonics.
    # Each tuple is (input_string, expected_output); decoding expected_output
    # as HTML must give back input_string character for character.
    ("", ""),  # Empty string
    ("Hello world!", "Hello world!"),  # Plain ASCII string
    ("Привет типограф!", "Привет типограф!"),  # Russian text
    ("< > & \"", "&lt; &gt; &amp; &quot;"),  # The most basic set of HTML mnemonics
    ("\u00AD", "&shy;"),  # Soft hyphen
    ("\u00A0\u2002\u2003\u2009\u200D\u200C", "&nbsp;&ensp;&emsp;&thinsp;&zwj;&zwnj;"),  # Spaces and invisible characters
    # Dashes and hyphens; written as escapes because the four glyphs are
    # nearly indistinguishable: en dash, em dash, hyphen, horizontal bar.
    ("\u2013 \u2014 \u2010 \u2015", "&ndash; &mdash; &hyphen; &horbar;"),
    ("$ ¢ £ ¤ ¥ € ₽", "&dollar; &cent; &pound; &curren; &yen; &euro; &#8381;"),  # Currency symbols
    # Set taken from html.entities.name2codepoint
    # NOTE(review): Æ, Œ, æ and œ are expected to stay unencoded in the rows
    # below - confirm this is intentional.
    ("Æ Á Â À Α Å Ã Ä Ä", "Æ &Aacute; &Acirc; &Agrave; &Alpha; &Aring; &Atilde; &Auml; &Auml;"),
    ("Β Ç Χ ‡ Δ Ð É Ê È", "&Beta; &Ccedil; &Chi; &Dagger; &Delta; &ETH; &Eacute; &Ecirc; &Egrave;"),
    ("Ε Η Ë Γ Í Î Ì Ι Ï", "&Epsilon; &Eta; &Euml; &Gamma; &Iacute; &Icirc; &Igrave; &Iota; &Iuml;"),
    ("Κ Λ Μ Ñ Ν Œ Ó Ô Ò Ö", "&Kappa; &Lambda; &Mu; &Ntilde; &Nu; Œ &Oacute; &Ocirc; &Ograve; &Ouml;"),
    ("Ω Ο Ø Õ Φ Π ″ Ψ Ρ Š", "&Omega; &Omicron; &Oslash; &Otilde; &Phi; &Pi; &Prime; &Psi; &Rho; &Scaron;"),
    ("Σ Þ Τ Θ Ú Û Ù Υ Ü", "&Sigma; &THORN; &Tau; &Theta; &Uacute; &Ucirc; &Ugrave; &Upsilon; &Uuml;"),
    ("Ξ Ý Ÿ Ζ á â ´ æ à", "&Xi; &Yacute; &Yuml; &Zeta; &aacute; &acirc; &acute; æ &agrave;"),
    ("ℵ α & ∧ ∠ ' å ≈ ã ä", "&alefsym; &alpha; &amp; &and; &ang; &apos; &aring; &asymp; &atilde; &auml;"),
    ("„ β ¦ • ∩ ç ¸ ¢ χ ˆ", "&bdquo; &beta; &brvbar; &bull; &cap; &ccedil; &cedil; &cent; &chi; &circ;"),
    ("♣ ≅ © ↵ ∪ ¤ ⇓ † ↓ °", "&clubs; &cong; &copy; &crarr; &cup; &curren; &dArr; &dagger; &darr; &deg;"),
    ("δ ♦ ÷ é ê è ∅ \u2003 \u2002", "&delta; &diams; &divide; &eacute; &ecirc; &egrave; &empty; &emsp; &ensp;"),
    ("ε ≡ η ð ë € ∃ ƒ ∀ ½", "&epsilon; &equiv; &eta; &eth; &euml; &euro; &exist; &fnof; &forall; &frac12;"),
    ("¼ ¾ ⁄ γ ≥ > ⇔ ↔ ♥ …", "&frac14; &frac34; &frasl; &gamma; &ge; &gt; &hArr; &harr; &hearts; &hellip;"),
    ("í î ¡ ì ℑ ∞ ∫ ι ¿ ∈", "&iacute; &icirc; &iexcl; &igrave; &image; &infin; &int; &iota; &iquest; &isin;"),
    ("ï κ ⇐ λ ⟨ « ← ⌈ “ ≤", "&iuml; &kappa; &lArr; &lambda; &lang; &laquo; &larr; &lceil; &ldquo; &le;"),
    ("⌊ ∗ ◊ \u200e ‹ ‘ < ¯ — µ", "&lfloor; &lowast; &loz; &lrm; &lsaquo; &lsquo; &lt; &macr; &mdash; &micro;"),
    ("· − μ ∇ \u00A0 – ≠ ∋ ¬ ∉", "&middot; &minus; &mu; &nabla; &nbsp; &ndash; &ne; &ni; &not; &notin;"),
    ("⊄ ñ ν ó ô œ ò ‾ ω", "&nsub; &ntilde; &nu; &oacute; &ocirc; œ &ograve; &oline; &omega;"),
    ("ο ⊕ ∨ ª º ø õ ⊗ ö", "&omicron; &oplus; &or; &ordf; &ordm; &oslash; &otilde; &otimes; &ouml;"),
    ("¶ ∂ ‰ ⊥ φ π ϖ ± £ ′ ∏", "&para; &part; &permil; &perp; &phi; &pi; &piv; &plusmn; &pound; &prime; &prod;"),
    ("∝ ψ \" ⇒ √ ⟩ » → ⌉ ”", "&prop; &psi; &quot; &rArr; &radic; &rang; &raquo; &rarr; &rceil; &rdquo;"),
    ("ℜ ® ⌋ ρ \u200f › ’ ‚ š", "&real; &reg; &rfloor; &rho; &rlm; &rsaquo; &rsquo; &sbquo; &scaron;"),
    ("⋅ § \u00AD σ ς ∼ ♠ ⊂ ⊆ ∑", "&sdot; &sect; &shy; &sigma; &sigmaf; &sim; &spades; &sub; &sube; &sum;"),
    ("⊃ ¹ ² ³ ⊇ ß τ ∴ θ ϑ", "&sup; &sup1; &sup2; &sup3; &supe; &szlig; &tau; &there4; &theta; &thetasym;"),
    ("\u2009 þ ˜ × ™ ⇑ ú ↑ û", "&thinsp; &thorn; &tilde; &times; &trade; &uArr; &uacute; &uarr; &ucirc;"),
    ("ù ¨ ϒ υ ü ℘ ξ ý ¥ ÿ", "&ugrave; &uml; &upsih; &upsilon; &uuml; &weierp; &xi; &yacute; &yen; &yuml;"),
    ("ζ \u200D \u200C + = %", "&zeta; &zwj; &zwnj; &plus; = %"),
    # Set taken from html.entities.html5
]
@pytest.mark.parametrize("input_string, expected_output", STRINGS_FOR_ENCODE)
def test_utf_to_html_mnemo(input_string, expected_output):
"""
Проверяет ПОВЕДЕНИЕ: кодирование Unicode-строк в HTML-мнемоники.
"""