Commit 1e5248bd authored by Noah Levitt's avatar Noah Levitt

2003-12-14 Noah Levitt

	* gucharmap/gen-guch-unicode-tables.pl: Some fixes.

	* gucharmap/unicode-blocks.h:
	* gucharmap/unicode-categories.h:
	* gucharmap/unicode-names.h:
	* gucharmap/unicode-nameslist.h:
	* gucharmap/unicode-unihan.h: New generated headers.

	* gucharmap/unicode/.cvsignore:
	* gucharmap/unicode/Makefile.am:
	* gucharmap/unicode/generate_categories.pl:
	* gucharmap/unicode/generate_nameslist.c:
	* gucharmap/unicode/generate_unihan.c:
	* gucharmap/unicode/generator.sh:
	* gucharmap/unicode/unicode_blocks.cI:
	* gucharmap/unicode/unicode_categories.cI:
	* gucharmap/unicode/unicode_data.cI:
	* gucharmap/unicode/unicode_nameslist.cI:
	* gucharmap/unicode/unicode_unihan.cI: Remove in favor of new ones.

	* configure.ac:
	* gucharmap/Makefile.am:
	* gucharmap/gucharmap-unicode-info.c: Change to use the new unicode
	tables.
parent 9aacaf20
2003-12-14 Noah Levitt
* gucharmap/gen-guch-unicode-tables.pl: Some fixes.
* gucharmap/unicode-blocks.h:
* gucharmap/unicode-categories.h:
* gucharmap/unicode-names.h:
* gucharmap/unicode-nameslist.h:
* gucharmap/unicode-unihan.h: New generated headers.
* gucharmap/unicode/.cvsignore:
* gucharmap/unicode/Makefile.am:
* gucharmap/unicode/generate_categories.pl:
* gucharmap/unicode/generate_nameslist.c:
* gucharmap/unicode/generate_unihan.c:
* gucharmap/unicode/generator.sh:
* gucharmap/unicode/unicode_blocks.cI:
* gucharmap/unicode/unicode_categories.cI:
* gucharmap/unicode/unicode_data.cI:
* gucharmap/unicode/unicode_nameslist.cI:
* gucharmap/unicode/unicode_unihan.cI: Remove in favor of new ones.
* configure.ac:
* gucharmap/Makefile.am:
* gucharmap/gucharmap-unicode-info.c: Change to use the new unicode
tables.
2003-12-14 Noah Levitt
* gucharmap/gen-guch-unicode-tables.pl: New unicode table generation
......
......@@ -143,7 +143,6 @@ AC_CONFIG_FILES([
gucharmap.pc
Makefile
gucharmap/Makefile
gucharmap/unicode/Makefile
po/Makefile.in
pixmaps/Makefile
help/Makefile
......
......@@ -148,10 +148,3 @@ endif
EXTRA_DIST = gucharmap_marshal.list gucharmaprc.rc
if MAINTAINER_MODE
SUBDIRS = unicode
endif
......@@ -14,10 +14,11 @@
#
use strict;
use vars '$UNZIP';
use vars ('$UNZIP', '$ICONV');
# if unzip isn't in your path you can put the path to it here
$UNZIP = 'unzip';
# if these things aren't in your path you can put full paths to them here
$UNZIP = 'unzip';
$ICONV = 'iconv';
sub process_unicode_data_txt ($);
sub process_unihan_zip ($);
......@@ -74,7 +75,9 @@ sub process_unicode_data_txt ($)
print $out "/* THIS IS A GENERATED FILE. CHANGES WILL BE OVERWRITTEN. */\n";
print $out "/* Generated by $0 */\n\n";
print $out "#include <glib/gtypes.h>\n\n";
print $out "#include <glib/gunicode.h>\n\n";
print $out "typedef struct _UnicodeName UnicodeName;\n\n";
print $out "static const struct _UnicodeName\n";
print $out "{\n";
......@@ -155,6 +158,8 @@ sub process_unicode_data_txt ($)
print $out "#include <glib/gunicode.h>\n\n";
print $out "typedef struct _UnicodeCategory UnicodeCategory;\n\n";
print $out "static const struct _UnicodeCategory\n";
print $out "{\n";
print $out " gunichar first;\n";
......@@ -217,11 +222,13 @@ sub process_unihan_zip ($)
print "processing $unihan_zip";
print $out "/* unicode-names.h */\n";
print $out "/* unicode-unihan.h */\n";
print $out "/* THIS IS A GENERATED FILE. CHANGES WILL BE OVERWRITTEN. */\n";
print $out "/* Generated by $0 */\n\n";
print $out "#include <glib/gtypes.h>\n\n";
print $out "#include <glib/gunicode.h>\n\n";
print $out "typedef struct _Unihan Unihan;\n\n";
print $out "static const struct _Unihan\n";
print $out "{\n";
......@@ -341,7 +348,7 @@ sub process_nameslist_txt ($)
{
my ($nameslist_txt) = @_;
open (my $nameslist, $nameslist_txt) or die;
open (my $nameslist, "$ICONV -f 'ISO8859-1' -t 'UTF-8' $nameslist_txt |") or die;
print "processing $nameslist_txt...";
......@@ -431,7 +438,11 @@ sub process_nameslist_txt ($)
print $out "/* THIS IS A GENERATED FILE. CHANGES WILL BE OVERWRITTEN. */\n";
print $out "/* Generated by $0 */\n\n";
print $out "#include <glib/gtypes.h>\n\n";
print $out "#include <glib/gunicode.h>\n\n";
print $out "typedef struct _UnicharString UnicharString;\n";
print $out "typedef struct _UnicharUnichar UnicharUnichar;\n";
print $out "typedef struct _NamesList NamesList;\n\n";
print $out "struct _UnicharString\n";
print $out "{\n";
......@@ -545,23 +556,22 @@ sub process_blocks_txt ($)
print "processing $blocks_txt...";
print $out "/* unicode-nameslist.h */\n";
print $out "/* unicode-blocks.h */\n";
print $out "/* THIS IS A GENERATED FILE. CHANGES WILL BE OVERWRITTEN. */\n";
print $out "/* Generated by $0 */\n\n";
print $out "const static struct _UnicodeBlock\n";
print $out "{\n";
print $out " gunichar start;\n";
print $out " gunichar end;\n";
print $out " const gchar *name;\n";
print $out "}\n";
print $out "unicode_blocks[] = \n";
print $out "#include <glib/gunicode.h>\n";
print $out "#include <gucharmap/gucharmap-unicode-info.h>\n";
print $out "#include <gucharmap/gucharmap_intl.h>\n\n";
print $out "const GucharmapUnicodeBlock gucharmap_unicode_blocks[] = \n";
print $out "{\n";
while (my $line = <$blocks>)
{
$line =~ /^([0-9A-F]+)\.\.([0-9A-F]+); (.+)$/ or next;
print $out qq/ { 0x$1, 0x$2, N_("$3") },\n/;
}
print $out " { (gunichar)(-1), (gunichar)(-1), NULL }\n";
print $out "};\n\n";
close ($blocks);
......
......@@ -17,6 +17,14 @@
* 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include "unicode-names.h"
#include "unicode-blocks.h"
#include "unicode-nameslist.h"
#include "unicode-categories.h"
#if ENABLE_UNIHAN
# include "unicode-unihan.h"
#endif
#if HAVE_CONFIG_H
# include <config.h>
#endif
......@@ -26,74 +34,6 @@
#include <gucharmap_intl.h>
#include <gucharmap/gucharmap-unicode-info.h>
typedef struct
{
gunichar index;
const gchar *name;
}
UnicodeData;
typedef struct
{
gunichar index;
const gchar *kDefinition;
const gchar *kCantonese;
const gchar *kMandarin;
const gchar *kTang;
const gchar *kKorean;
const gchar *kJapeneseKun;
const gchar *kJapaneseOn;
}
Unihan;
typedef struct
{
gunichar index;
gchar *value;
}
UnicharString;
typedef struct
{
gunichar index;
gunichar value;
}
UnicharUnichar;
typedef struct
{
gunichar index;
gint equals_index; /* -1 means */
gint stars_index; /* this character */
gint exes_index; /* doesn't */
gint pounds_index; /* have any */
gint colons_index;
}
NamesList;
typedef struct
{
gunichar first;
gunichar last;
GUnicodeType category;
}
UnicodeCategory;
#include "unicode/unicode_data.cI"
#include "unicode/unicode_blocks.cI"
#if ENABLE_UNIHAN
# include "unicode/unicode_unihan.cI"
#endif
#include "unicode/unicode_nameslist.cI"
#include "unicode/unicode_categories.cI"
/* constants for hangul (de)composition, see UAX #15 */
#define SBase 0xAC00
#define LBase 0x1100
......@@ -290,26 +230,26 @@ gucharmap_unicode_canonical_decomposition (gunichar ch,
/* does a binary search on unicode_data */
/* does a binary search on unicode_names */
G_CONST_RETURN gchar *
gucharmap_get_unicode_data_name (gunichar uc)
{
gint min = 0;
gint mid;
gint max = sizeof (unicode_data) / sizeof (UnicodeData) - 1;
gint max = sizeof (unicode_names) / sizeof (UnicodeName) - 1;
if (uc < unicode_data[0].index || uc > unicode_data[max].index)
if (uc < unicode_names[0].index || uc > unicode_names[max].index)
return "";
while (max >= min)
{
mid = (min + max) / 2;
if (uc > unicode_data[mid].index)
if (uc > unicode_names[mid].index)
min = mid + 1;
else if (uc < unicode_data[mid].index)
else if (uc < unicode_names[mid].index)
max = mid - 1;
else
return unicode_data[mid].name;
return unicode_names[mid].name;
}
return NULL;
......@@ -362,14 +302,14 @@ gucharmap_find_substring_match (gunichar start,
const gchar *search_text,
gint direction)
{
gint max = sizeof (unicode_data) / sizeof (UnicodeData) - 1;
gint max = sizeof (unicode_names) / sizeof (UnicodeName) - 1;
gint i0;
gint i;
g_assert (direction == -1 || direction == 1);
/* locate the start character by binary search */
if (start < unicode_data[0].index || start > UNICHAR_MAX)
if (start < unicode_names[0].index || start > UNICHAR_MAX)
i0 = 0;
else
{
......@@ -378,9 +318,9 @@ gucharmap_find_substring_match (gunichar start,
while (max >= min)
{
mid = (min + max) / 2;
if (start > unicode_data[mid].index)
if (start > unicode_names[mid].index)
min = mid + 1;
else if (start < unicode_data[mid].index)
else if (start < unicode_names[mid].index)
max = mid - 1;
else
break;
......@@ -389,18 +329,18 @@ gucharmap_find_substring_match (gunichar start,
i0 = mid;
}
max = sizeof (unicode_data) / sizeof (UnicodeData) - 1;
max = sizeof (unicode_names) / sizeof (UnicodeName) - 1;
/* try substring match on each */
for (i = i0 + direction; i != i0; )
{
if (unicode_data[i].index > UNICHAR_MAX)
if (unicode_names[i].index > UNICHAR_MAX)
{
i += direction;
continue;
}
if (ascii_case_strrstr (unicode_data[i].name, search_text) != NULL)
return unicode_data[i].index;
if (ascii_case_strrstr (unicode_names[i].name, search_text) != NULL)
return unicode_names[i].index;
i += direction;
if (i > max)
......@@ -410,8 +350,8 @@ gucharmap_find_substring_match (gunichar start,
}
/* if the start character matches we want to return a match */
if (ascii_case_strrstr (unicode_data[i].name, search_text) != NULL)
return unicode_data[i].index;
if (ascii_case_strrstr (unicode_names[i].name, search_text) != NULL)
return unicode_names[i].index;
return (gunichar)(-1);
}
......
/* unicode_blocks.cI */
/* THIS IS A GENERATED FILE. */
/* http://www.unicode.org/Public/UNIDATA/Blocks.txt */
/* unicode-blocks.h */
/* THIS IS A GENERATED FILE. CHANGES WILL BE OVERWRITTEN. */
/* Generated by ./gen-guch-unicode-tables.pl */
const GucharmapUnicodeBlock gucharmap_unicode_blocks[] =
#include <glib/gunicode.h>
#include <gucharmap/gucharmap-unicode-info.h>
#include <gucharmap/gucharmap_intl.h>
const GucharmapUnicodeBlock gucharmap_unicode_blocks[] =
{
{ 0x0000, 0x007F, N_("Basic Latin") },
{ 0x0080, 0x00FF, N_("Latin-1 Supplement") },
......@@ -131,3 +135,4 @@ const GucharmapUnicodeBlock gucharmap_unicode_blocks[] =
{ 0x100000, 0x10FFFF, N_("Supplementary Private Use Area-B") },
{ (gunichar)(-1), (gunichar)(-1), NULL }
};
/* unicode_categories.cI */
/* THIS IS A GENERATED FILE. */
/* http://www.unicode.org/Public/UNIDATA/UnicodeData.txt */
/* unicode-categories.h */
/* THIS IS A GENERATED FILE. CHANGES WILL BE OVERWRITTEN. */
/* Generated by ./gen-guch-unicode-tables.pl */
const UnicodeCategory unicode_categories[] =
#include <glib/gunicode.h>
typedef struct _UnicodeCategory UnicodeCategory;
static const struct _UnicodeCategory
{
gunichar first;
gunichar last;
GUnicodeType category;
}
unicode_categories[] =
{
{ 0x0000, 0x001F, G_UNICODE_CONTROL },
{ 0x0020, 0x0020, G_UNICODE_SPACE_SEPARATOR },
......
/* unicode_data.cI */
/* THIS IS A GENERATED FILE. */
/* http://www.unicode.org/Public/UNIDATA/UnicodeData.txt */
/* unicode-names.h */
/* THIS IS A GENERATED FILE. CHANGES WILL BE OVERWRITTEN. */
/* Generated by ./gen-guch-unicode-tables.pl */
const UnicodeData unicode_data[] =
#include <glib/gunicode.h>
typedef struct _UnicodeName UnicodeName;
static const struct _UnicodeName
{
gunichar index;
const gchar *name;
}
unicode_names[] =
{
{ 0x0000, "<control>" },
{ 0x0001, "<control>" },
......@@ -15105,3 +15114,4 @@ const UnicodeData unicode_data[] =
{ 0x100000, "<Plane 16 Private Use, First>" },
{ 0x10FFFD, "<Plane 16 Private Use, Last>" },
};
Makefile
Makefile.in
unicode_data.cI
unicode_unihan.cI
unicode_blocks.cI
unicode_nameslist.cI
generate_unihan
generate_nameslist
unicode.org
## $Id$
##
## Copyright (c) 2003 Noah Levitt <nlevitt аt columbia.edu>
##
## This program is free software; you can redistribute it and/or modify it
## under the terms of the GNU General Public License as published by the
## Free Software Foundation; either version 2 of the License, or (at your
## option) any later version.
##
## This program is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License along
## with this program; if not, write to the Free Software Foundation, Inc.,
## 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
##
## Generated Unicode table fragments; each one is produced by generator.sh
## (see the rules below).
BUILT_SOURCES = unicode_data.cI unicode_unihan.cI unicode_blocks.cI \
unicode_nameslist.cI unicode_categories.cI
## Every table is (re)generated by running generator.sh with the name of the
## target file as its only argument. The unihan and nameslist tables also
## depend on the helper programs built from this directory.
unicode_unihan.cI: generate_unihan
$(srcdir)/generator.sh unicode_unihan.cI
unicode_data.cI:
$(srcdir)/generator.sh unicode_data.cI
unicode_blocks.cI:
$(srcdir)/generator.sh unicode_blocks.cI
unicode_nameslist.cI: generate_nameslist
$(srcdir)/generator.sh unicode_nameslist.cI
unicode_categories.cI:
$(srcdir)/generator.sh unicode_categories.cI
## Helper programs used only at build time; the noinst_ prefix keeps them
## from being installed.
noinst_PROGRAMS = generate_unihan generate_nameslist
generate_unihan_SOURCES = generate_unihan.c
generate_nameslist_SOURCES = generate_nameslist.c
## Backup copies of the tables, removed by "make clean".
## NOTE(review): presumably the *.old files are left behind by generator.sh
## -- confirm against that script.
CLEANFILES = unicode_unihan.cI.old unicode_data.cI.old \
unicode_blocks.cI.old unicode_nameslist.cI.old \
unicode_categories.cI.old
## The generated tables themselves are removed only by
## "make maintainer-clean".
MAINTAINERCLEANFILES = unicode_unihan.cI unicode_data.cI unicode_blocks.cI \
unicode_nameslist.cI unicode_categories.cI
## Ship the generator sources, the driver script and the pre-generated
## tables in the distribution tarball.
EXTRA_DIST = generate_nameslist.c generate_unihan.c \
generator.sh unicode_data.cI unicode_unihan.cI \
unicode_blocks.cI unicode_nameslist.cI \
unicode_categories.cI
#!/usr/bin/perl -w
#
# reads UnicodeData.txt on stdin, prints unicode_categories.cI on stdout
#
# Sticks ranges of codepoints together. In UnicodeData.txt, a character
# name in '<>' means the start or end of a range. This script recognizes
# these ranges. In all other cases, it sticks together only runs of
# adjacent codepoints with the same category.
#
# Each output entry has the form { first, last, category }: the first and
# last codepoint of a run, printed in hex, and the G_UNICODE_* name of the
# run's category.
use strict;
# Map general category code onto symbolic name.
my %mappings =
(
# Normative.
'Lu' => "G_UNICODE_UPPERCASE_LETTER",
'Ll' => "G_UNICODE_LOWERCASE_LETTER",
'Lt' => "G_UNICODE_TITLECASE_LETTER",
'Mn' => "G_UNICODE_NON_SPACING_MARK",
'Mc' => "G_UNICODE_COMBINING_MARK",
'Me' => "G_UNICODE_ENCLOSING_MARK",
'Nd' => "G_UNICODE_DECIMAL_NUMBER",
'Nl' => "G_UNICODE_LETTER_NUMBER",
'No' => "G_UNICODE_OTHER_NUMBER",
'Zs' => "G_UNICODE_SPACE_SEPARATOR",
'Zl' => "G_UNICODE_LINE_SEPARATOR",
'Zp' => "G_UNICODE_PARAGRAPH_SEPARATOR",
'Cc' => "G_UNICODE_CONTROL",
'Cf' => "G_UNICODE_FORMAT",
'Cs' => "G_UNICODE_SURROGATE",
'Co' => "G_UNICODE_PRIVATE_USE",
'Cn' => "G_UNICODE_UNASSIGNED",
# Informative.
'Lm' => "G_UNICODE_MODIFIER_LETTER",
'Lo' => "G_UNICODE_OTHER_LETTER",
'Pc' => "G_UNICODE_CONNECT_PUNCTUATION",
'Pd' => "G_UNICODE_DASH_PUNCTUATION",
'Ps' => "G_UNICODE_OPEN_PUNCTUATION",
'Pe' => "G_UNICODE_CLOSE_PUNCTUATION",
'Pi' => "G_UNICODE_INITIAL_PUNCTUATION",
'Pf' => "G_UNICODE_FINAL_PUNCTUATION",
'Po' => "G_UNICODE_OTHER_PUNCTUATION",
'Sm' => "G_UNICODE_MATH_SYMBOL",
'Sc' => "G_UNICODE_CURRENCY_SYMBOL",
'Sk' => "G_UNICODE_MODIFIER_SYMBOL",
'So' => "G_UNICODE_OTHER_SYMBOL"
);
# Emit the file header comments and the opening of the C array initializer.
print "/* unicode_categories.cI */\n";
print "/* THIS IS A GENERATED FILE. */\n";
print "/* http://www.unicode.org/Public/UNIDATA/UnicodeData.txt */\n";
print "\n";
print "const UnicodeCategory unicode_categories[] =\n";
print "{\n";
# State of the run being accumulated. The negative and placeholder initial
# values guarantee that the first input line starts a new run without
# printing an entry (see the $last_codepoint >= 0 test in the loop).
# Note: $codepoint, $category and $name are re-declared with `my` inside the
# loop, so these outer copies are shadowed there; only $last_codepoint,
# $start_codepoint and $last_category carry state across iterations.
my ($codepoint, $last_codepoint, $start_codepoint) = (-99, -99, -99);
my ($category, $last_category) = ("G_XXX", "G_YYY");
my $name;
# $started_range is set between a "<..., First>" line and its matching
# "<..., Last>" line; $finished_range is set right after a "<..., Last>" line.
my ($started_range, $finished_range) = (undef, undef);
while (my $line = <>)
{
# Capture the first three ';'-separated fields: hex codepoint, character
# name and general category code. The result of the match is not checked.
$line =~ /^([0-9A-F]*);([^;]*);([^;]*);/;
my $codepoint = hex $1;
my $name = $2;
my $category = $mappings{$3};
# The current run ends when the previous line closed an explicit range,
# when the category changes, or when (outside an explicit First..Last
# range) the codepoint is not adjacent to the previous one.
if ($finished_range or ($category ne $last_category)
or (not $started_range and $codepoint != $last_codepoint + 1))
{
# Nothing to flush before the very first line (sentinel is negative).
if ($last_codepoint >= 0)
{
printf(" { 0x%4.4X, 0x%4.4X, %s },\n",
$start_codepoint, $last_codepoint, $last_category);
}
$start_codepoint = $codepoint;
}
# Track explicit range markers so the gap between a "First" and a "Last"
# line is not treated as a break in adjacency.
if ($name =~ /^<.*First>$/) {
$started_range = 1;
$finished_range = undef;
}
elsif ($name =~ /^<.*Last>$/) {
$started_range = undef;
$finished_range = 1;
}
elsif ($finished_range) {
# An ordinary line after a "Last" line: the forced break has already been
# handled above, so clear the flag.
$finished_range = undef;
}
$last_codepoint = $codepoint;
$last_category = $category;
}
# Flush the final run, which the loop never gets a chance to print.
printf(" { 0x%4.4X, 0x%4.4X, %s },\n",
$start_codepoint, $last_codepoint, $last_category);
print "};\n\n";
exit;
/* $Id$ */
/*
* Copyright (c) 2003 Noah Levitt <nlevitt аt users.sourceforge.net>
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License as published by the
* Free Software Foundation; either version 2 of the License, or (at your
* option) any later version.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
* with this program; if not, write to the Free Software Foundation, Inc.,
* 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
/* reads NameList.txt (converted to UTF-8) from stdin, prints
* unicode_nameslist.cI on stdout */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/*
 * Allocate `size` bytes with malloc(), or terminate the program.
 *
 * Returns the non-NULL pointer obtained from malloc().  If malloc() returns
 * NULL, a diagnostic is written to stderr and the process exits with
 * EXIT_FAILURE, so callers never receive a NULL result.
 */
void *
xmalloc (size_t size)
{
  void *mem = malloc (size);

  if (mem == NULL)
    {
      fprintf (stderr, "malloc(%u) failed\n", (unsigned) size);
      exit (EXIT_FAILURE);
    }

  return mem;
}
/*
 * Resize the allocation at `ptr` to `size` bytes with realloc(), or
 * terminate the program.
 *
 * Returns the non-NULL pointer obtained from realloc().  If realloc()
 * returns NULL, a diagnostic naming the original pointer and the requested
 * size is written to stderr and the process exits with EXIT_FAILURE.
 */
void *
xrealloc (void *ptr, size_t size)
{
  void *mem = realloc (ptr, size);

  if (mem == NULL)
    {
      fprintf (stderr, "realloc (0x%p, %u) failed\n", ptr, (unsigned) size);
      exit (EXIT_FAILURE);
    }

  return mem;
}
/*
 * Reads characters from `fin` up to and including the next newline.
 *
 * fin     stream to read from
 * bufptr  address of a heap buffer; the buffer is grown with xrealloc()
 *         as needed and *bufptr is updated to the (possibly moved) buffer
 * bsize   address of the buffer's current capacity; doubled on each growth
 *
 * Returns the number of characters stored in *bufptr, not counting the
 * terminating '\0'.  Returns 0, without touching the buffer, when the
 * stream is already at end-of-file or when any of the pointer/size
 * arguments is null/zero.
 *
 * NOTE(review): this shares its name with POSIX getline(), which has a
 * different signature; it may conflict on systems whose <stdio.h> declares
 * the POSIX function.
 */
size_t
getline (FILE *fin, char **bufptr, size_t *bsize)
{
  size_t len = 0;
  char *buf;
  int ch;   /* int, not char, so that EOF is distinguishable from data */

  if (feof (fin) || bsize == 0 || *bsize == 0
      || bufptr == 0 || *bufptr == 0)
    return (size_t) 0;

  buf = *bufptr;

  /* Stop on EOF *or* on a read error: getc() returns EOF in both cases,
   * whereas feof() alone stays false after an error and would make the
   * loop spin forever while the buffer keeps doubling. */
  while ((ch = getc (fin)) != EOF)
    {
      buf[len++] = (char) ch;

      /* always keep room for the next character plus the terminator */
      if (len >= *bsize - 1)
        {
          *bsize *= 2;
          buf = xrealloc (buf, *bsize);
          *bufptr = buf;
        }

      if (ch == '\n')
        break;
    }

  buf[len] = '\0';
  return len;
}
/*
 * Wraps `str` in double quotes, preceding every embedded double quote with
 * a backslash.  All other characters (including backslashes) are copied
 * unchanged.
 *
 * Returns a newly allocated, NUL-terminated string obtained from xmalloc();
 * the caller is responsible for freeing it.
 */
char *
quote (char *str)
{
  /* worst case: every character needs an escape, plus the two quotes and
   * the terminator */
  char *out = xmalloc (2 * strlen (str) + 8);
  char *dst = out;
  const char *src;

  *dst++ = '"';

  for (src = str; *src != '\0'; src++)
    {
      if (*src == '"')
        *dst++ = '\\';
      *dst++ = *src;
    }

  *dst++ = '"';
  *dst = '\0';

  return out;
}
void
process_nameslist_txt (FILE *fin)
{
char *line, *temp;
size_t bsize, len;
int equal_i=0, ex_i=0, star_i=0, pound_i=0, colon_i=0;
int equal0_i=-1, ex0_i=-1, star0_i=-1, pound0_i=-1, colon0_i=-1;
FILE *equal_file, *ex_file, *star_file, *pound_file, *colon_file, *main_file;
unsigned uc = 0, ucv;
char c;
equal_file = tmpfile ();
ex_file = tmpfile ();
star_file = tmpfile ();
pound_file = tmpfile ();
colon_file = tmpfile ();
main_file = tmpfile ();
bsize = 32;
line = xmalloc (bsize);
for (len = getline (fin, &line, &bsize); len > 0;
len = getline (fin, &line, &bsize))
{
line[len-1] = '\0';
if