Add combining characters to unaccent.rules.
author: Thomas Munro <tmunro@postgresql.org>
Fri, 1 Feb 2019 14:23:01 +0000 (15:23 +0100)
committer: Thomas Munro <tmunro@postgresql.org>
Fri, 1 Feb 2019 14:23:01 +0000 (15:23 +0100)
Strip certain classes of combining characters, so that accents encoded
this way are removed.

Author: Hugh Ranalli
Discussion: https://postgr.es/m/15548-cef1b3f8de190d4f%40postgresql.org

contrib/unaccent/expected/unaccent.out
contrib/unaccent/generate_unaccent_rules.py
contrib/unaccent/sql/unaccent.sql
contrib/unaccent/unaccent.rules

index 69c2cf9bd7ab0af4656c20de86c9f00e09f6c478..c1bd7cd897df05eb0914c015e2d71ae230655783 100644 (file)
@@ -31,6 +31,12 @@ SELECT unaccent('˃˖˗˜');
  >+-~
 (1 row)
 
+SELECT unaccent('À');  -- Remove combining diacritical 0x0300
+ unaccent 
+----------
+ A
+(1 row)
+
 SELECT unaccent('unaccent', 'foobar');
  unaccent 
 ----------
@@ -55,6 +61,12 @@ SELECT unaccent('unaccent', '˃˖˗˜');
  >+-~
 (1 row)
 
+SELECT unaccent('unaccent', 'À');
+ unaccent 
+----------
+ A
+(1 row)
+
 SELECT ts_lexize('unaccent', 'foobar');
  ts_lexize 
 -----------
@@ -79,3 +91,9 @@ SELECT ts_lexize('unaccent', '˃˖˗˜');
  {>+-~}
 (1 row)
 
+SELECT ts_lexize('unaccent', 'À');
+ ts_lexize 
+-----------
+ {A}
+(1 row)
+
index 4419a771edf919d20a317a9383c5f6ff3fa9b0ff..58b6e7deb74c4dc62e5cd331dd036eb996e6a7f0 100644 (file)
@@ -61,8 +61,25 @@ PLAIN_LETTER_RANGES = ((ord('a'), ord('z')), # Latin lower case
                        (0x03b1, 0x03c9),     # GREEK SMALL LETTER ALPHA, GREEK SMALL LETTER OMEGA
                        (0x0391, 0x03a9))     # GREEK CAPITAL LETTER ALPHA, GREEK CAPITAL LETTER OMEGA
 
+# Combining marks follow a "base" character, and result in a composite
+# character. Example: "U&'A\0300'" produces "À". There are three types of
+# combining marks: enclosing (Me), non-spacing combining (Mn), spacing
+# combining (Mc). We identify the ranges of marks we feel safe removing.
+# References:
+#   https://en.wikipedia.org/wiki/Combining_character
+#   https://www.unicode.org/charts/PDF/U0300.pdf
+#   https://www.unicode.org/charts/PDF/U20D0.pdf
+COMBINING_MARK_RANGES = ((0x0300, 0x0362),  # Mn: Accents, IPA
+                         (0x20dd, 0x20E0),  # Me: Symbols
+                         (0x20e2, 0x20e4),) # Me: Screen, keycap, triangle
+
 def print_record(codepoint, letter):
-    print (chr(codepoint) + "\t" + letter)
+    if letter:
+        output = chr(codepoint) + "\t" + letter
+    else:
+        output = chr(codepoint)
+
+    print(output)
 
 class Codepoint:
     def __init__(self, id, general_category, combining_ids):
@@ -70,6 +87,16 @@ class Codepoint:
         self.general_category = general_category
         self.combining_ids = combining_ids
 
+def is_mark_to_remove(codepoint):
+    """Return true if this is a combining mark to remove."""
+    if not is_mark(codepoint):
+        return False
+
+    for begin, end in COMBINING_MARK_RANGES:
+        if codepoint.id >= begin and codepoint.id <= end:
+            return True
+    return False
+
 def is_plain_letter(codepoint):
     """Return true if codepoint represents a "plain letter"."""
     for begin, end in PLAIN_LETTER_RANGES:
@@ -234,6 +261,8 @@ def main(args):
                              "".join(chr(combining_codepoint.id)
                                      for combining_codepoint \
                                      in get_plain_letters(codepoint, table))))
+        elif is_mark_to_remove(codepoint):
+            charactersSet.add((codepoint.id, None))
 
     # add CLDR Latin-ASCII characters
     if not args.noLigaturesExpansion:
index c671827caa55a634bfa9aa5752108433636b1a7e..2ae097ff2b86171b2255b30d44ee39bdd9f1f66c 100644 (file)
@@ -9,13 +9,16 @@ SELECT unaccent('foobar');
 SELECT unaccent('ёлка');
 SELECT unaccent('ЁЖИК');
 SELECT unaccent('˃˖˗˜');
+SELECT unaccent('À');  -- Remove combining diacritical 0x0300
 
 SELECT unaccent('unaccent', 'foobar');
 SELECT unaccent('unaccent', 'ёлка');
 SELECT unaccent('unaccent', 'ЁЖИК');
 SELECT unaccent('unaccent', '˃˖˗˜');
+SELECT unaccent('unaccent', 'À');
 
 SELECT ts_lexize('unaccent', 'foobar');
 SELECT ts_lexize('unaccent', 'ёлка');
 SELECT ts_lexize('unaccent', 'ЁЖИК');
 SELECT ts_lexize('unaccent', '˃˖˗˜');
+SELECT ts_lexize('unaccent', 'À');
index 7ce25eef03d61668e0d631ed0cb1d4391fc402a9..99826408ac14560e4970a46f4e424c819cab158b 100644 (file)
 ˖ +
 ˗ -
 ˜ ~
+ฬ€
+ฬ
+ฬ‚
+ฬƒ
+ฬ„
+ฬ…
+ฬ†
+ฬ‡
+ฬˆ
+ฬ‰
+ฬŠ
+ฬ‹
+ฬŒ
+ฬ
+ฬŽ
+ฬ
+ฬ
+ฬ‘
+ฬ’
+ฬ“
+ฬ”
+ฬ•
+ฬ–
+ฬ—
+ฬ˜
+ฬ™
+ฬš
+ฬ›
+ฬœ
+ฬ
+ฬž
+ฬŸ
+ฬ 
+ฬก
+ฬข
+ฬฃ
+ฬค
+ฬฅ
+ฬฆ
+ฬง
+ฬจ
+ฬฉ
+ฬช
+ฬซ
+ฬฌ
+ฬญ
+ฬฎ
+ฬฏ
+ฬฐ
+ฬฑ
+ฬฒ
+ฬณ
+ฬด
+ฬต
+ฬถ
+ฬท
+ฬธ
+ฬน
+ฬบ
+ฬป
+ฬผ
+ฬฝ
+ฬพ
+ฬฟ
+อ€
+อ
+อ‚
+อƒ
+อ„
+อ…
+อ†
+อ‡
+อˆ
+อ‰
+อŠ
+อ‹
+อŒ
+อ
+อŽ
+อ
+อ
+อ‘
+อ’
+อ“
+อ”
+อ•
+อ–
+อ—
+อ˜
+อ™
+อš
+อ›
+อœ
+อ
+อž
+อŸ
+อ 
+อก
+อข
 Ά Α
 Έ Ε
 Ή Η
 ₧    Pts
 ₹    Rs
 ₺    TL
+⃝
+⃞
+⃟
+⃠
+⃢
+⃣
+⃤
 ℀    a/c
 ℁    a/s
 ℂ    C