Commit 1ab1cc6a authored by Noah Levitt's avatar Noah Levitt

The source files filled with unicode information are now generated at make time.

parent 656ac498
2003-02-23 Noah Levitt <nlevitt аt columbia.edu>
* src/Makefile.am:
* src/unicode_info.[ch]:
* extract_unihan.sh:
* generate_unicode_stuff.sh:
* src/generate_unicode_stuff.sh:
* src/.cvsignore:
* src/unihan.c:
* src/unicode_data.c: The source files filled with unicode information
are now generated at make time.
2003-02-10 Noah Levitt <nlevitt аt columbia.edu>
* src/main.c: The Find and Copy buttons know when to be sensitive.
......
......@@ -31,8 +31,8 @@ $Id$
- Replace bootstrap with autogen.sh. [DONE]
- Write the shell script that automates generation of the source
files that are based on uincode.org information. Make it part of
the build process?? Maybe the autogen process...
files that are based on unicode.org information. Make it part of
the build process?? Maybe the autogen process... [DONE]
- Do something about accessibility. This might mean having the
charmap implement AtkTable as suggested by hp.
......
#!/bin/sh
#
# $Id$
#
#
# looks at Unihan.txt information and defines a unihan_t for each character
# sample output:
#
# { 0x4E00, "one; a, an; alone", "YAT1", "YI1", "qit4", "IL", "", "ICHI ITSU" },
# { 0x4E01, "male adult; robust, vigorous; the fourth heavenly stem", "DING1 JANG1", "DING1 ZHENG1", "deng1", "CENG", "", "TEI CHOU TOU" },
# { 0x4E02, "obstruction of breath (qi) as it seeks release; variant of other characters", "", "KAO3 QIAO3 YU2", "", "KYO", "", "KOU" },
# { 0x4E03, "seven", "CHAT1", "QI1 SHANG3", "tsit4", "CHIL", "", "SHICHI SHITSU" },
#
#
# sample usage: ./extract_unihan.sh < Unihan.txt
#
#
# typedef struct
# {
# gunichar index;
# gchar *kDefinition;
# gchar *kCantonese;
# gchar *kMandarin;
# gchar *kTang;
# gchar *kKorean;
# gchar *kJapaneseKun;
# gchar *kJapaneseOn;
# }
# unihan_t;
#
#
# assumes there are no quotes or other characters that should be escaped in
# any of the values (which is not true, but there are only a couple
# exceptions, which can be easily fixed)
#
# Prints the table row for code point $1 ("U+XXXX") from the k* variables
# collected so far and then clears them.  Prints nothing when none of the
# seven properties was seen for that character.
emit_entry()
{
    if [ "x$kDefinition$kCantonese$kMandarin$kTang$kKorean$kJapaneseKun$kJapaneseOn" = "x" ] ; then
        return 0
    fi

    echo " { 0x${1#U+}, \"$kDefinition\", \"$kCantonese\", \"$kMandarin\", \"$kTang\", \"$kKorean\", \"$kJapaneseKun\", \"$kJapaneseOn\" },"

    kDefinition=""
    kCantonese=""
    kMandarin=""
    kTang=""
    kKorean=""
    kJapaneseKun=""
    kJapaneseOn=""
}

kDefinition=""
kCantonese=""
kMandarin=""
kTang=""
kKorean=""
kJapaneseKun=""
kJapaneseOn=""
curr_index=""

# Each data line is "U+XXXX <property> <value>"; lines for one character
# are expected to be adjacent.
while read index property value
do
    # Only data lines start with a code point; this skips comment lines
    # (with or without a space after the '#') and blank lines.
    case $index in
        U+*) ;;
        *) continue ;;
    esac

    # A new code point: flush the previous character first, then fall
    # through so that this line's own property is still recorded.
    if [ "x$index" != "x$curr_index" ] ; then
        emit_entry "$curr_index"
        curr_index=$index
    fi

    case $property in
        "kDefinition") kDefinition=$value ;;
        "kCantonese") kCantonese=$value ;;
        "kMandarin") kMandarin=$value ;;
        "kTang") kTang=$value ;;
        "kKorean") kKorean=$value ;;
        "kJapaneseKun") kJapaneseKun=$value ;;
        "kJapaneseOn") kJapaneseOn=$value ;;
    esac
done

# The last character has no following line to trigger its output.
emit_entry "$curr_index"
#!/bin/sh
#
# $Id$
#
#
# Copyright (c) 2002 Noah Levitt <nlevitt аt users.sourceforge.net>
#
# This program is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the
# Free Software Foundation; either version 2 of the License, or (at your
# option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
#
#
# This script gets files from unicode.org and generates
# src/unicode_data.c and src/unihan.c
#
# Paths of the external tools; each is empty when `which` prints nothing.
UNZIP=$(which unzip)
WGET=$(which wget)

# Downloads are kept in unicode.org/ and sources are written to src/,
# both relative to the directory the script is started from.
unidir=$PWD/unicode.org
srcdir=$PWD/src
# Prints "error: <message>" and terminates the script with status 1.
#   $1  the message
# Defined with the portable name() form: the `function` keyword is not
# part of the sh language this script's #!/bin/sh line asks for.
die()
{
    echo "error: $1"
    exit 1
}
# Fetches one file from http://www.unicode.org/Public/UNIDATA/ into $unidir.
#   $1  name of the file below that URL
# Exits the script with status 1 when wget is not available (after telling
# the user which files to fetch by hand) or when the transfer fails.
download()
{
    if [ "x$WGET" = "x" ] ; then
        echo
        echo "error: wget not found, can't download files"
        echo
        echo "You can download these files yourself and put them in $unidir:"
        echo
        echo "http://www.unicode.org/Public/UNIDATA/UnicodeData.txt"
        echo "http://www.unicode.org/Public/UNIDATA/Unihan.zip"
        echo
        # without this the empty $WGET below would run "--directory-prefix=..."
        exit 1
    fi

    $WGET --directory-prefix="$unidir" \
        "http://www.unicode.org/Public/UNIDATA/$1" || exit 1
}
# Make sure the download directory exists and really is a directory.
# All paths are quoted so that a working directory containing spaces works.
if [ -e "$unidir" ] && [ ! -d "$unidir" ] ; then
    echo "error: $unidir exists and is not a directory"
    exit 1
fi

if [ ! -e "$unidir" ] ; then
    echo "mkdir $unidir"
    mkdir "$unidir" || exit 1
fi

# Fetch whichever input files are not present yet.
for f in UnicodeData.txt Unihan.zip ; do
    if [ ! -e "$unidir/$f" ] ; then
        download "$f"
    else
        echo "already have $unidir/$f, not downloading"
    fi
done

# Keep a copy of the previous output before it gets overwritten.
if [ -e "$srcdir/unicode_data.c" ] ; then
    /bin/cp "$srcdir/unicode_data.c" "$srcdir/unicode_data.old"
fi
# Start unicode_data.c afresh (">" truncates) with the fixed preamble:
# the includes, the unicode_data_t record type and the opening of the
# unicode_data[] table.
/bin/cat > $srcdir/unicode_data.c <<EOF
/* unicode_data.c */
/* THIS IS A GENERATED FILE. */
#include <gtk/gtk.h>
#include <string.h>
#include "unicode_info.h"
typedef struct
{
gunichar index;
const gchar *name;
}
unicode_data_t;
static const unicode_data_t unicode_data[] =
{
EOF
# Append one table row per line of UnicodeData.txt: the first
# semicolon-separated field becomes the hexadecimal index, the second
# field becomes the quoted name string.
/usr/bin/awk -F';' '{print " { 0x" $1 ", \"" $2 "\" },"}' \
< "$unidir/UnicodeData.txt" \
>> "$srcdir/unicode_data.c"
# Close the table and append the lookup functions.  Everything from here
# up to the closing EOF line is here-document text that ends up in
# unicode_data.c; the delimiter is unquoted, so that text must stay free
# of characters the shell would expand.
/bin/cat >> $srcdir/unicode_data.c <<EOF
};
/* Looks uc up in unicode_data by binary search.
 *
 * Returns the name stored for uc, "" when uc is below the first or above
 * the last index in the table, and NULL when uc lies inside that range
 * but has no entry. */
const gchar *
get_unicode_data_name (gunichar uc)
{
  gint lo = 0;
  gint hi = sizeof (unicode_data) / sizeof (unicode_data[0]) - 1;

  if (uc < unicode_data[0].index || uc > unicode_data[hi].index)
    return "";

  while (lo <= hi)
    {
      gint probe = (lo + hi) / 2;

      if (unicode_data[probe].index == uc)
        return unicode_data[probe].name;

      if (unicode_data[probe].index < uc)
        lo = probe + 1;
      else
        hi = probe - 1;
    }

  return NULL;
}
/* ASCII case-insensitive search for the last occurrence of needle in
 * haystack (adapted from glib).
 *
 * Returns a pointer to the start of the match, haystack itself when
 * needle is empty, and NULL when there is no match.  A NULL argument is
 * rejected by g_return_val_if_fail with a NULL result. */
static const gchar *
ascii_case_strrstr (const gchar *haystack, const gchar *needle)
{
  gsize needle_len;
  gsize haystack_len;
  gsize offset;
  gsize i;

  g_return_val_if_fail (haystack != NULL, NULL);
  g_return_val_if_fail (needle != NULL, NULL);

  needle_len = strlen (needle);
  haystack_len = strlen (haystack);

  if (needle_len == 0)
    return haystack;

  if (haystack_len < needle_len)
    return NULL;

  /* Walk the candidate positions from the rightmost one down to 0.  An
   * unsigned offset is counted down instead of decrementing a pointer,
   * so no pointer before the start of haystack is ever formed. */
  offset = haystack_len - needle_len + 1;
  while (offset-- > 0)
    {
      for (i = 0; i < needle_len; i++)
        if (g_ascii_tolower (haystack[offset + i]) != g_ascii_tolower (needle[i]))
          break;

      if (i == needle_len)
        return haystack + offset;
    }

  return NULL;
}
/* Finds the next character after start whose name contains search_text
 * (ASCII case-insensitive).
 *
 * start:        character to search from; it is itself tried last
 * unichar_max:  largest character that may be returned
 * search_text:  text to look for in the character names
 *
 * The scan wraps around to the beginning of the table after the last
 * entry that does not exceed unichar_max.  Returns the matching
 * character, or (gunichar)(-1) if nothing matches. */
gunichar
find_next_substring_match (gunichar start, gunichar unichar_max,
                           const gchar *search_text)
{
  gint count = sizeof (unicode_data) / sizeof (unicode_data[0]);
  gint min = 0;
  gint mid = 0;
  gint max = count - 1;
  gint i0;
  gint i;

  /* no entry at all is within the permitted range */
  if (unicode_data[0].index > unichar_max)
    return (gunichar)(-1);

  /* locate the start character by binary search */
  if (start < unicode_data[0].index || start > unichar_max)
    i0 = 0;
  else
    {
      while (max >= min)
        {
          mid = (min + max) / 2;
          if (start > unicode_data[mid].index)
            min = mid + 1;
          else if (start < unicode_data[mid].index)
            max = mid - 1;
          else
            break;
        }

      i0 = mid;

      /* start need not be in the table; if the search stopped on a
       * neighbour beyond unichar_max the wrapping scan below could never
       * come back to it, so begin at the first entry instead */
      if (unicode_data[i0].index > unichar_max)
        i0 = 0;
    }

  /* try every other permitted entry once, beginning after i0; the wrap
   * test runs before the entry is read so the table is never indexed
   * past its end */
  i = i0;
  for (;;)
    {
      i++;
      if (i >= count || unicode_data[i].index > unichar_max)
        i = 0;

      if (i == i0)
        break;

      if (ascii_case_strrstr (unicode_data[i].name, search_text) != NULL)
        return unicode_data[i].index;
    }

  /* if the start character matches we want to return a match */
  if (ascii_case_strrstr (unicode_data[i0].name, search_text) != NULL)
    return unicode_data[i0].index;

  return (gunichar)(-1);
}
EOF
......@@ -3,3 +3,5 @@ Makefile.in
gucharmap
gucharmap_marshal.c
gucharmap_marshal.h
unicode_data.cI
unicode_unihan.cI
......@@ -29,11 +29,19 @@ lib_LTLIBRARIES = libgucharmap.la
libgucharmap_la_SOURCES = gucharmap_marshal.c gucharmap_marshal.h \
charmap.c charmap.h \
unicode_data.c unicode_info.c \
unicode_info.h unihan.c \
unicode_data.cI unicode_unihan.cI \
unicode_info.c unicode_info.h \
gucharmap_intl.c gucharmap_intl.h \
mini_fontsel.c mini_fontsel.h
# Generated table of character names.  The target has no prerequisites,
# so the script is run only when the file does not exist yet; the
# script's failure fails the build.
unicode_data.cI:
./generate_unicode_stuff.sh unicode_data.cI || exit 1
# Generated Unihan table, produced by the same script under the same
# conditions.
unicode_unihan.cI:
./generate_unicode_stuff.sh unicode_unihan.cI || exit 1
# this is different from the project version
# http://sources.redhat.com/autobook/autobook/autobook_91.html
libgucharmap_la_LDFLAGS = -version-info 1:0:0
......@@ -47,6 +55,7 @@ bin_PROGRAMS = gucharmap
gucharmap_SOURCES = main.c
if OS_WIN32
gucharmap_LDADD = gucharmaprc.o libgucharmap.la @GTK_LIBS@ @POPT_LIBS@
......@@ -99,4 +108,5 @@ uninstall-local:
endif
EXTRA_DIST = gucharmap_marshal.list gucharmaprc.rc
EXTRA_DIST = gucharmap_marshal.list gucharmaprc.rc unicode_data.cI \
unicode_unihan.cI
#!/bin/sh
#
# $Id$
#
#
# Copyright (c) 2002 Noah Levitt <nlevitt аt users.sourceforge.net>
#
# This program is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the
# Free Software Foundation; either version 2 of the License, or (at your
# option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
#
#
# This script gets files from unicode.org and generates
# unicode_data.cI and unicode_unihan.cI
#
# Paths of the external tools used below; each is empty when `which`
# prints nothing for it.
UNZIP=$(which unzip)
WGET=$(which wget)
SED=$(which sed)
MV=$(which mv)
ECHO=$(which echo)
MKDIR=$(which mkdir)
AWK=$(which awk)

# Downloads are kept in unicode.org/ below the current directory and the
# generated files are written to the current directory itself.
unidir=$PWD/unicode.org
srcdir=$PWD
# Fetches one file from http://www.unicode.org/Public/UNIDATA/ into $unidir.
#   $1  name of the file below that URL
# Exits the script with status 1 when wget is not available (after telling
# the user which files to fetch by hand) or when the transfer fails.
# Defined with the portable name() form because the script runs under
# #!/bin/sh, where the `function` keyword is not guaranteed to exist.
download()
{
    if [ "x$WGET" = "x" ] ; then
        $ECHO
        $ECHO "error: wget not found, can't download files"
        $ECHO
        $ECHO "You can download these files yourself and put them in $unidir:"
        $ECHO
        $ECHO "http://www.unicode.org/Public/UNIDATA/UnicodeData.txt"
        $ECHO "http://www.unicode.org/Public/UNIDATA/Unihan.zip"
        $ECHO
        exit 1
    fi

    # the directory is quoted so a path containing spaces stays one argument
    $WGET --directory-prefix="$unidir" \
        "http://www.unicode.org/Public/UNIDATA/$1" || exit 1
}
# Turns UnicodeData.txt (read from stdin) into the C source of the
# unicode_data[] table (written to stdout).  Every input line yields one
# row built from its first two semicolon-separated fields: the first is
# emitted as a hexadecimal number, the second as a string.
# Defined with the portable name() form because the script runs under
# #!/bin/sh, where the `function` keyword is not guaranteed to exist.
write_unicode_data()
{
    $ECHO "/* unicode_data.cI */"
    $ECHO "/* THIS IS A GENERATED FILE. */"
    $ECHO "/* http://www.unicode.org/Public/UNIDATA/UnicodeData.txt */"
    $ECHO ""
    $ECHO "const UnicodeData unicode_data[] ="
    $ECHO "{"
    $AWK -F';' '{print " { 0x" $1 ", \"" $2 "\" },"}'
    $ECHO "};"
}
# Prints $1 as a C initializer: the text wrapped in double quotes, or a
# bare 0 when $1 is empty or missing.
# printf is used instead of echo so that backslashes in $1 are printed
# unchanged whichever shell runs the script.
stringify()
{
    if [ "x$1" = "x" ] ; then
        printf '%s\n' 0
    else
        printf '"%s"\n' "$1"
    fi
}
# Prints $1 with backslashes and double quotes escaped, so that the text
# can be placed inside a C string literal.  sed is only started for values
# that actually contain one of those characters.
unihan_c_escape()
{
    case $1 in
        *\\* | *\"*) printf '%s\n' "$1" | $SED -e 's/\\/\\\\/g' -e 's/"/\\"/g' ;;
        *) printf '%s\n' "$1" ;;
    esac
}

# Prints the table row for code point $1 ("U+XXXX") from the k* variables
# collected so far and then clears them.  Prints nothing when none of the
# seven properties was seen for that character.
unihan_emit_entry()
{
    if [ "x$kDefinition$kCantonese$kMandarin$kTang$kKorean$kJapaneseKun$kJapaneseOn" = "x" ] ; then
        return 0
    fi

    hex="0x${1#U+}"
    QQkDefinition=`stringify "$kDefinition"`
    QQkCantonese=`stringify "$kCantonese"`
    QQkMandarin=`stringify "$kMandarin"`
    QQkTang=`stringify "$kTang"`
    QQkKorean=`stringify "$kKorean"`
    QQkJapaneseKun=`stringify "$kJapaneseKun"`
    QQkJapaneseOn=`stringify "$kJapaneseOn"`

    # printf, unlike echo, never reinterprets the escapes added above
    printf ' { %s, %s, %s, %s, %s, %s, %s, %s },\n' "$hex" \
        "$QQkDefinition" "$QQkCantonese" "$QQkMandarin" "$QQkTang" \
        "$QQkKorean" "$QQkJapaneseKun" "$QQkJapaneseOn"

    unset kDefinition kCantonese kMandarin kTang
    unset kKorean kJapaneseKun kJapaneseOn
}

# Turns Unihan.txt (read from stdin) into the C source of the unihan[]
# table (written to stdout).  Data lines have the form
# "U+XXXX <property> <value>"; lines for one character are expected to be
# adjacent.  A character gets a row only if at least one of the seven
# properties handled below is present; absent properties become 0.
write_unihan()
{
    $ECHO "/* unicode_unihan.cI */"
    $ECHO "/* THIS IS A GENERATED FILE. */"
    $ECHO "/* http://www.unicode.org/Public/UNIDATA/Unihan.zip */"
    $ECHO ""
    $ECHO ""
    $ECHO "#if HAVE_CONFIG_H"
    $ECHO "# include <config.h>"
    $ECHO "#endif"
    $ECHO "#include <unicode_info.h>"
    $ECHO ""
    $ECHO ""
    $ECHO "#if ENABLE_UNIHAN"
    $ECHO ""
    $ECHO "const Unihan unihan[] ="
    $ECHO "{"

    unset kDefinition kCantonese kMandarin kTang
    unset kKorean kJapaneseKun kJapaneseOn
    curr_index=""

    # -r keeps backslashes in the data; unihan_c_escape deals with them
    while read -r index property value
    do
        # Only data lines start with a code point.  This skips comment
        # lines (with or without a space after the '#'), blank lines and
        # any status lines the unzip step may have put into the stream.
        case $index in
            U+*) ;;
            *) continue ;;
        esac

        # A new code point: flush the previous character first, then fall
        # through so that this line's own property is still recorded.
        if [ "x$index" != "x$curr_index" ] ; then
            unihan_emit_entry "$curr_index"
            curr_index=$index
        fi

        case $property in
            "kDefinition") kDefinition=`unihan_c_escape "$value"` ;;
            "kCantonese") kCantonese=`unihan_c_escape "$value"` ;;
            "kMandarin") kMandarin=`unihan_c_escape "$value"` ;;
            "kTang") kTang=`unihan_c_escape "$value"` ;;
            "kKorean") kKorean=`unihan_c_escape "$value"` ;;
            "kJapaneseKun") kJapaneseKun=`unihan_c_escape "$value"` ;;
            "kJapaneseOn") kJapaneseOn=`unihan_c_escape "$value"` ;;
        esac
    done

    # The last character has no following line to trigger its output.
    unihan_emit_entry "$curr_index"

    $ECHO "};"
    $ECHO ""
    $ECHO ""
    $ECHO "#endif /* #if ENABLE_UNIHAN */"
    $ECHO ""
}
# Moves an existing file out of the way before it is regenerated.
#   $1  path of the file; if it exists it is renamed to "$1.old",
#       replacing any earlier backup
# Defined with the portable name() form because the script runs under
# #!/bin/sh, where the `function` keyword is not guaranteed to exist.
backup()
{
    if [ -e "$1" ] ; then
        $ECHO "backing up existing $1 to $1.old"
        $MV -f "$1" "$1.old"
    fi
}
# Makes sure the download directory $unidir exists.
# Exits the script with status 1 when $unidir exists but is not a
# directory, or when it cannot be created.
# Defined with the portable name() form because the script runs under
# #!/bin/sh, where the `function` keyword is not guaranteed to exist.
make_download_dir()
{
    if [ -e "$unidir" ] && [ ! -d "$unidir" ] ; then
        $ECHO "error: $unidir exists and is not a directory"
        exit 1
    fi

    if [ ! -e "$unidir" ] ; then
        $ECHO "creating directory $unidir"
        $MKDIR "$unidir" || exit 1
    fi
}
# Generates $srcdir/unicode_data.cI from UnicodeData.txt, downloading the
# input first when it is not in $unidir yet.  An existing output file is
# kept as unicode_data.cI.old.  Exits with status 1 if the output cannot
# be written.
# Defined with the portable name() form because the script runs under
# #!/bin/sh, where the `function` keyword is not guaranteed to exist.
do_unicode_data()
{
    make_download_dir

    f="$unidir/UnicodeData.txt"
    if [ ! -e "$f" ] ; then
        download "UnicodeData.txt"
    else
        $ECHO "already have $f, not downloading"
    fi

    out="$srcdir/unicode_data.cI"
    backup "$out"

    $ECHO -n "writing $out..."
    # without the explicit exit the final "done" would hide a failure
    write_unicode_data < "$f" > "$out" || exit 1
    $ECHO "done"
}
# Generates $srcdir/unicode_unihan.cI from Unihan.zip, downloading the
# archive first when it is not in $unidir yet.  An existing output file is
# kept as unicode_unihan.cI.old.  Exits with status 1 when unzip is not
# available or the output cannot be produced.
# Defined with the portable name() form because the script runs under
# #!/bin/sh, where the `function` keyword is not guaranteed to exist.
do_unihan()
{
    if [ "x$UNZIP" = "x" ] ; then
        $ECHO
        $ECHO "error: unzip not found, can't unzip Unihan.zip"
        $ECHO
        exit 1
    fi

    make_download_dir

    f="$unidir/Unihan.zip"
    if [ ! -e "$f" ] ; then
        download "Unihan.zip"
    else
        $ECHO "already have $f, not downloading"
    fi

    out="$srcdir/unicode_unihan.cI"
    backup "$out"

    $ECHO -n "writing $out (this will take a long time)..."
    # -p sends only the file data to the pipe; -c would also print the
    # archive and member names into the stream that write_unihan parses
    $UNZIP -p "$f" | write_unihan > "$out" || exit 1
    $ECHO "done"
}
# end of functions
# Entry point: the single argument names the file to generate.  Anything
# else (including no argument) prints the usage text and exits with
# status 1.
if [ "x$1" = "xunicode_data.cI" ] ; then
    do_unicode_data
elif [ "x$1" = "xunicode_unihan.cI" ] ; then
    do_unihan
else
    echo "usage: $0 FILE_TO_GENERATE"
    echo " where FILE_TO_GENERATE is unicode_data.cI or unicode_unihan.cI"
    exit 1
fi
This diff is collapsed.
......@@ -17,8 +17,18 @@
* 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#if HAVE_CONFIG_H
# include <config.h>
#endif
#include <gtk/gtk.h>
#include "unicode_info.h"
#include <unicode_info.h>
#include <unicode_data.cI>
#if ENABLE_UNIHAN
# include <unicode_unihan.cI>
#endif
/* constants for hangul (de)composition, see UAX #15 */
#define SBase 0xAC00
......@@ -329,3 +339,276 @@ unicode_canonical_decomposition (gunichar ch, gsize *result_len)
return g_unicode_canonical_decomposition (ch, result_len);
}
/* Looks uc up in unicode_data by binary search.
 *
 * Returns the name stored for uc, "" when uc is below the first or above
 * the last index in the table, and NULL when uc lies inside that range
 * but has no entry. */
const gchar *
get_unicode_data_name (gunichar uc)
{
  gint lo = 0;
  gint hi = sizeof (unicode_data) / sizeof (unicode_data[0]) - 1;

  if (uc < unicode_data[0].index || uc > unicode_data[hi].index)
    return "";

  while (lo <= hi)
    {
      gint probe = (lo + hi) / 2;

      if (unicode_data[probe].index == uc)
        return unicode_data[probe].name;

      if (unicode_data[probe].index < uc)
        lo = probe + 1;
      else
        hi = probe - 1;
    }

  return NULL;
}
/* ASCII case-insensitive search for the last occurrence of needle in
 * haystack (adapted from glib).
 *
 * Returns a pointer to the start of the match, haystack itself when
 * needle is empty, and NULL when there is no match.  A NULL argument is
 * rejected by g_return_val_if_fail with a NULL result. */
static const gchar *
ascii_case_strrstr (const gchar *haystack, const gchar *needle)
{
  gsize needle_len;
  gsize haystack_len;
  gsize offset;
  gsize i;

  g_return_val_if_fail (haystack != NULL, NULL);
  g_return_val_if_fail (needle != NULL, NULL);

  needle_len = strlen (needle);
  haystack_len = strlen (haystack);

  if (needle_len == 0)
    return haystack;

  if (haystack_len < needle_len)
    return NULL;

  /* Walk the candidate positions from the rightmost one down to 0.  An
   * unsigned offset is counted down instead of decrementing a pointer,
   * so no pointer before the start of haystack is ever formed. */
  offset = haystack_len - needle_len + 1;
  while (offset-- > 0)
    {
      for (i = 0; i < needle_len; i++)
        if (g_ascii_tolower (haystack[offset + i]) != g_ascii_tolower (needle[i]))
          break;

      if (i == needle_len)
        return haystack + offset;
    }

  return NULL;
}
/* case insensitive; returns (gunichar)(-1) if nothing found */
gunichar
find_next_substring_match (gunichar start, gunichar unichar_max,
const gchar *search_text)
{
gint min = 0;
gint mid = 0;
gint max = sizeof (unicode_data) / sizeof (UnicodeData) - 1;
gint i0;
gint i;
/* locate the start character by binary search */
if (start < unicode_data[0].index || start > unichar_max)
i0 = 0;
else
{
while (max >= min)
{
mid = (min + max) / 2;
if (start > unicode_data[mid].index)
min = mid + 1;
else if (start < unicode_data[mid].index)
max = mid - 1;
else
break;
}
i0 = mid;
}
/* try substring match on each */
max = sizeof (unicode_data) / sizeof (UnicodeData);
for (i = i0+1; i != i0; )
{
if (ascii_case_strrstr (unicode_data[i].name, search_text) != NULL)
return unicode_data[i].index;