char.def 4.02 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147
#
#   Japanese character category map
#
#   $Id: char.def,v 1.4 2006/07/05 16:54:13 taku-ku Exp $;
#

###################################################################################
#
#  CHARACTER CATEGORY DEFINITION
#
#  One row per category, in the form:
#
#  CATEGORY_NAME INVOKE GROUP LENGTH
#
#   - CATEGORY_NAME: Name of the category. A category named DEFAULT must be defined.
#   - INVOKE: 1/0:   always invoke unknown word processing, even when the word
#                    can be found in the lexicon
#   - GROUP:  1/0:   make a new word by grouping characters of the same
#                    character category
#   - LENGTH: n:     new words of length 1 to n are added
#
#  The category names declared here are the ones referenced by the
#  code-point mapping rows further down in this file.
#
DEFAULT	       0 1 0  # DEFAULT is a mandatory category!
SPACE	       0 1 0  
KANJI	       0 0 2
SYMBOL	       1 1 0
NUMERIC	       1 1 0
ALPHA	       1 1 0
HIRAGANA       0 1 2 
KATAKANA       1 1 2
KANJINUMERIC   1 1 0
GREEK	       1 1 0
CYRILLIC       1 1 0

###################################################################################
#
# CODE(UCS2) TO CATEGORY MAPPING
#

# SPACE
# The blank character plus four ASCII control characters are mapped to SPACE.
0x0020 SPACE  # DO NOT REMOVE THIS LINE, 0x0020 is reserved for SPACE
0x000D SPACE  # CR (carriage return)
0x0009 SPACE  # HT (horizontal tab)
0x000B SPACE  # VT (vertical tab)
0x000A SPACE  # LF (line feed)

# ASCII
# Printable ASCII 0x0021..0x007E split into punctuation, digits and letters.
0x0021..0x002F SYMBOL   # '!' .. '/'
0x0030..0x0039 NUMERIC  # '0' .. '9'
0x003A..0x0040 SYMBOL   # ':' .. '@'
0x0041..0x005A ALPHA    # 'A' .. 'Z'
0x005B..0x0060 SYMBOL   # '[' .. '`'
0x0061..0x007A ALPHA    # 'a' .. 'z'
0x007B..0x007E SYMBOL   # '{' .. '~'

# Latin
# NOTE(review): the Extended B and Extended Additional ranges stop short of the
# current Unicode block ends (U+024F and U+1EFF) - presumably they follow the
# Unicode version in use when this table was written; confirm before widening.
0x00A1..0x00BF SYMBOL # Latin 1 (punctuation and symbols)
0x00C0..0x00FF ALPHA  # Latin 1 (letters)
0x0100..0x017F ALPHA  # Latin Extended A
0x0180..0x0236 ALPHA  # Latin Extended B
0x1E00..0x1EF9 ALPHA  # Latin Extended Additional

# CYRILLIC
0x0400..0x04F9 CYRILLIC # Cyrillic
0x0500..0x050F CYRILLIC # Cyrillic supplementary

# GREEK
0x0374..0x03FB GREEK # Greek and Coptic

# HIRAGANA
0x3041..0x309F  HIRAGANA  # Hiragana

# KATAKANA
0x30A1..0x30FF  KATAKANA  # Katakana
0x31F0..0x31FF  KATAKANA  # Small KU .. Small RO
# 0x30FC (the prolonged sound mark) already lies inside 0x30A1..0x30FF above;
# it is listed again on its own row. The commented-out row is an alternative
# that would also assign it to HIRAGANA.
# 0x30FC          KATAKANA HIRAGANA  # ー
0x30FC          KATAKANA

# Half KATAKANA
0xFF66..0xFF9D  KATAKANA  # halfwidth katakana letters
0xFF9E..0xFF9F  KATAKANA  # halfwidth voiced / semi-voiced sound marks

# KANJI
# NOTE(review): 0x3005 and 0x3007 are also covered by the 0x3000..0x303F SYMBOL
# range listed later in this file, and 0x3007 is mapped a third time near the
# end of the table. Presumably the last matching row takes effect - confirm
# against the dictionary compiler before relying on these two rows.
0x2E80..0x2EF3  KANJI # CJK Radicals Supplement
0x2F00..0x2FD5  KANJI # Kangxi Radicals
0x3005          KANJI # ideographic iteration mark
0x3007          KANJI # ideographic number zero
0x3400..0x4DB5  KANJI # CJK Unified Ideographs Extension A
0x4E00..0x9FA5  KANJI # CJK Unified Ideographs
0xF900..0xFA2D  KANJI # CJK Compatibility Ideographs
0xFA30..0xFA6A  KANJI # CJK Compatibility Ideographs

# KANJI-NUMERIC (一 二 三 四 五 六 七 八 九 十 百 千 万 億 兆)
# Each row names two categories, KANJINUMERIC first and KANJI second.
# All of these code points also fall inside the 0x4E00..0x9FA5 KANJI range
# listed earlier in this file.
# NOTE(review): presumably the first category is the primary one and these
# rows replace the earlier range mapping - confirm against the dictionary
# compiler.
0x4E00 KANJINUMERIC KANJI  # 一
0x4E8C KANJINUMERIC KANJI  # 二
0x4E09 KANJINUMERIC KANJI  # 三
0x56DB KANJINUMERIC KANJI  # 四
0x4E94 KANJINUMERIC KANJI  # 五
0x516D KANJINUMERIC KANJI  # 六
0x4E03 KANJINUMERIC KANJI  # 七
0x516B KANJINUMERIC KANJI  # 八
0x4E5D KANJINUMERIC KANJI  # 九
0x5341 KANJINUMERIC KANJI  # 十
0x767E KANJINUMERIC KANJI  # 百
0x5343 KANJINUMERIC KANJI  # 千
0x4E07 KANJINUMERIC KANJI  # 万
0x5104 KANJINUMERIC KANJI  # 億
0x5146 KANJINUMERIC KANJI  # 兆

# ZENKAKU
# Fullwidth counterparts of the ASCII ranges, categorised the same way as the
# ASCII rows earlier in this file.
0xFF10..0xFF19 NUMERIC # fullwidth digits
0xFF21..0xFF3A ALPHA # fullwidth capital letters
0xFF41..0xFF5A ALPHA # fullwidth small letters
0xFF01..0xFF0F SYMBOL # fullwidth '!' .. '/'
0xFF1A..0xFF1F SYMBOL # fullwidth ':' .. '?'
0xFF3B..0xFF40 SYMBOL # fullwidth '[' .. '`'
0xFF5B..0xFF65 SYMBOL # fullwidth '{' .. halfwidth katakana middle dot
0xFFE0..0xFFEF SYMBOL # Halfwidth and Fullwidth Forms

# OTHER SYMBOLS
# Rows are not in ascending code-point order. Apart from the repeated
# Letterlike Symbols row noted below, the ranges in this section do not
# overlap one another.
0x2000..0x206F  SYMBOL # General Punctuation
0x2070..0x209F  NUMERIC # Superscripts and Subscripts
0x20A0..0x20CF  SYMBOL # Currency Symbols
0x20D0..0x20FF  SYMBOL # Combining Diacritical Marks for Symbols
0x2100..0x214F  SYMBOL # Letterlike Symbols
0x2150..0x218F  NUMERIC # Number Forms
# NOTE(review): the next row repeats the Letterlike Symbols row above with a
# shorter end bound (0x214B instead of 0x214F). It looks redundant - confirm
# and consider removing it.
0x2100..0x214B  SYMBOL # Letterlike Symbols
0x2190..0x21FF  SYMBOL # Arrows
0x2200..0x22FF  SYMBOL # Mathematical Operators
0x2300..0x23FF  SYMBOL # Miscellaneous Technical
0x2460..0x24FF  SYMBOL # Enclosed Alphanumerics
# NOTE(review): this range starts at 0x2501, so 0x2500 (the first code point
# of the Unicode Box Drawing block) is not mapped here - confirm whether that
# is intentional.
0x2501..0x257F  SYMBOL # Box Drawing
0x2580..0x259F  SYMBOL # Block Elements
0x25A0..0x25FF  SYMBOL # Geometric Shapes
0x2600..0x26FE  SYMBOL # Miscellaneous Symbols
0x2700..0x27BF  SYMBOL # Dingbats
0x27F0..0x27FF  SYMBOL # Supplemental Arrows A
0x27C0..0x27EF  SYMBOL # Miscellaneous Mathematical Symbols-A
0x2800..0x28FF  SYMBOL # Braille Patterns
0x2900..0x297F  SYMBOL # Supplemental Arrows B
0x2B00..0x2BFF  SYMBOL # Miscellaneous Symbols and Arrows
0x2A00..0x2AFF  SYMBOL # Supplemental Mathematical Operators
0x3300..0x33FF  SYMBOL # CJK Compatibility
0x3200..0x32FE  SYMBOL # Enclosed CJK Letters and Months
# This range includes 0x3005 and 0x3007, which the KANJI section earlier in
# this file also lists individually.
0x3000..0x303F  SYMBOL # CJK Symbols and Punctuation
0xFE30..0xFE4F  SYMBOL # CJK Compatibility Forms
0xFE50..0xFE6B  SYMBOL # Small Form Variants

# added 2006/3/13
# 0x3007 is the ideographic number zero. It is listed with two categories,
# SYMBOL first and KANJINUMERIC second. The same code point is already covered
# by an individual KANJI row and by the 0x3000..0x303F SYMBOL range earlier in
# this file.
# NOTE(review): presumably this final row is the one that takes effect -
# confirm against the dictionary compiler.
0x3007 SYMBOL KANJINUMERIC

# END OF TABLE