gucharmap-unicode-info.c 18.4 KB
Newer Older
1
/*
 * Copyright © 2004 Noah Levitt
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the
 * Free Software Foundation; either version 3 of the License, or (at your
 * option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 59 Temple Place, Suite 330, Boston, MA 02110-1301  USA
 */

Christian Persch's avatar
Christian Persch committed
19
#include <config.h>
20

21
#include <gtk/gtk.h>
22
#include <string.h>
23

24 25
#include <glib/gi18n-lib.h>

26
#include "gucharmap.h"
27
#include "gucharmap-private.h"
28

Noah Levitt's avatar
Noah Levitt committed
29 30 31 32
#include "unicode-names.h"
#include "unicode-blocks.h"
#include "unicode-nameslist.h"
#include "unicode-categories.h"
33
#include "unicode-versions.h"
34
#include "unicode-unihan.h"
Noah Levitt's avatar
Noah Levitt committed
35

36 37 38 39 40 41 42 43
/* constants for hangul (de)composition, see UAX #15 */
/* SBase: value subtracted from a code point to obtain the syllable index. */
#define SBase 0xAC00
/* LCount, VCount, TCount: number of entries in the L, V and T jamo name
 * tables respectively. */
#define LCount 19
#define VCount 21
#define TCount 28
/* NCount: number of syllable indices that share one L-table entry. */
#define NCount (VCount * TCount)
/* SCount: total number of syllable indices that can be decomposed. */
#define SCount (LCount * NCount)

44
static const gchar JAMO_L_TABLE[][4] = {
45
  "G", "GG", "N", "D", "DD", "R", "M", "B", "BB",
46
  "S", "SS", "", "J", "JJ", "C", "K", "T", "P", "H"
47 48
};

49
static const gchar JAMO_V_TABLE[][4] = {
50 51 52 53 54
  "A", "AE", "YA", "YAE", "EO", "E", "YEO", "YE", "O",
  "WA", "WAE", "OE", "YO", "U", "WEO", "WE", "WI",
  "YU", "EU", "YI", "I"
};

55
static const gchar JAMO_T_TABLE[][4] = {
56 57 58
  "", "G", "GG", "GS", "N", "NJ", "NH", "D", "L", "LG", "LM",
  "LB", "LS", "LT", "LP", "LH", "M", "B", "BS",
  "S", "SS", "NG", "J", "C", "K", "T", "P", "H"
59 60
};

61
const gchar *
Noah Levitt's avatar
Noah Levitt committed
62
gucharmap_get_unicode_name (gunichar wc)
63
{
64
  static gchar buf[32];
65

66 67
  _gucharmap_intl_ensure_initialized ();

68 69 70
  if ((wc >= 0x3400 && wc <= 0x4db5)
      || (wc >= 0x4e00 && wc <= 0x9fd5)
      || (wc >= 0x20000 && wc <= 0x2a6d6)
Christian Persch's avatar
Christian Persch committed
71
      || (wc >= 0x2a700 && wc <= 0x2b734)
72 73
      || (wc >= 0x2b740 && wc <= 0x2b81d)
      || (wc >= 0x2b820 && wc <= 0x2cea1))
Noah Levitt's avatar
Noah Levitt committed
74 75 76 77
    {
      g_snprintf (buf, sizeof (buf), "CJK UNIFIED IDEOGRAPH-%04X", wc);
      return buf;
    }
78 79 80 81 82
  else if ((wc >= 0xf900 && wc <= 0xfaff) ||
           (wc >= 0x2f800 && wc <= 0x2fa1d)) {
      g_snprintf (buf, sizeof (buf), "CJK COMPATIBILITY IDEOGRAPH-%04X", wc);
      return buf;
  }
83 84 85 86 87
  else if (wc >= 0x17000 && wc <= 0x187ec) {
      g_snprintf (buf, sizeof (buf), "TANGUT IDEOGRAPH-%05X", wc);
      return buf;
  }
  else if (wc >= 0x18800 && wc <= 0x18af2) {
88
      g_snprintf (buf, sizeof (buf), "TANGUT COMPONENT-%03u", wc - 0x18800 + 1);
89 90
      return buf;
  }
Noah Levitt's avatar
Noah Levitt committed
91 92 93 94 95
  else if (wc >= 0xac00 && wc <= 0xd7af)
    {
      /* compute hangul syllable name as per UAX #15 */
      gint SIndex = wc - SBase;
      gint LIndex, VIndex, TIndex;
96

Noah Levitt's avatar
Noah Levitt committed
97 98
      if (SIndex < 0 || SIndex >= SCount)
        return "";
99

Noah Levitt's avatar
Noah Levitt committed
100 101 102
      LIndex = SIndex / NCount;
      VIndex = (SIndex % NCount) / TCount;
      TIndex = SIndex % TCount;
103

Noah Levitt's avatar
Noah Levitt committed
104 105
      g_snprintf (buf, sizeof (buf), "HANGUL SYLLABLE %s%s%s", 
                  JAMO_L_TABLE[LIndex], JAMO_V_TABLE[VIndex], JAMO_T_TABLE[TIndex]);
106

Noah Levitt's avatar
Noah Levitt committed
107 108 109
      return buf;
    }
  else if (wc >= 0xD800 && wc <= 0xDB7F) 
110
    return _("<Non Private Use High Surrogate>");
Noah Levitt's avatar
Noah Levitt committed
111
  else if (wc >= 0xDB80 && wc <= 0xDBFF) 
112
    return _("<Private Use High Surrogate>");
Noah Levitt's avatar
Noah Levitt committed
113 114 115
  else if (wc >= 0xDC00 && wc <= 0xDFFF)
    return _("<Low Surrogate>");
  else if (wc >= 0xE000 && wc <= 0xF8FF) 
116
    return _("<Private Use>");
Noah Levitt's avatar
Noah Levitt committed
117
  else if (wc >= 0xF0000 && wc <= 0xFFFFD)
118
    return _("<Plane 15 Private Use>");
Noah Levitt's avatar
Noah Levitt committed
119
  else if (wc >= 0x100000 && wc <= 0x10FFFD)
120
    return _("<Plane 16 Private Use>");
121
  else
122
    {
Noah Levitt's avatar
Noah Levitt committed
123
      const gchar *x = gucharmap_get_unicode_data_name (wc);
124
      if (x == NULL)
125
        return _("<not assigned>");
126 127 128
      else
        return x;
    }
129 130
}

131
const gchar *
Noah Levitt's avatar
Noah Levitt committed
132
gucharmap_get_unicode_category_name (gunichar wc)
133
{
134 135
  _gucharmap_intl_ensure_initialized ();

Noah Levitt's avatar
Noah Levitt committed
136
  switch (gucharmap_unichar_type (wc))
137
    {
138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167
      case G_UNICODE_CONTROL: return _("Other, Control");
      case G_UNICODE_FORMAT: return _("Other, Format");
      case G_UNICODE_UNASSIGNED: return _("Other, Not Assigned");
      case G_UNICODE_PRIVATE_USE: return _("Other, Private Use");
      case G_UNICODE_SURROGATE: return _("Other, Surrogate");
      case G_UNICODE_LOWERCASE_LETTER: return _("Letter, Lowercase");
      case G_UNICODE_MODIFIER_LETTER: return _("Letter, Modifier");
      case G_UNICODE_OTHER_LETTER: return _("Letter, Other");
      case G_UNICODE_TITLECASE_LETTER: return _("Letter, Titlecase");
      case G_UNICODE_UPPERCASE_LETTER: return _("Letter, Uppercase");
      case G_UNICODE_COMBINING_MARK: return _("Mark, Spacing Combining");
      case G_UNICODE_ENCLOSING_MARK: return _("Mark, Enclosing");
      case G_UNICODE_NON_SPACING_MARK: return _("Mark, Non-Spacing");
      case G_UNICODE_DECIMAL_NUMBER: return _("Number, Decimal Digit");
      case G_UNICODE_LETTER_NUMBER: return _("Number, Letter");
      case G_UNICODE_OTHER_NUMBER: return _("Number, Other");
      case G_UNICODE_CONNECT_PUNCTUATION: return _("Punctuation, Connector");
      case G_UNICODE_DASH_PUNCTUATION: return _("Punctuation, Dash");
      case G_UNICODE_CLOSE_PUNCTUATION: return _("Punctuation, Close");
      case G_UNICODE_FINAL_PUNCTUATION: return _("Punctuation, Final Quote");
      case G_UNICODE_INITIAL_PUNCTUATION: return _("Punctuation, Initial Quote");
      case G_UNICODE_OTHER_PUNCTUATION: return _("Punctuation, Other");
      case G_UNICODE_OPEN_PUNCTUATION: return _("Punctuation, Open");
      case G_UNICODE_CURRENCY_SYMBOL: return _("Symbol, Currency");
      case G_UNICODE_MODIFIER_SYMBOL: return _("Symbol, Modifier");
      case G_UNICODE_MATH_SYMBOL: return _("Symbol, Math");
      case G_UNICODE_OTHER_SYMBOL: return _("Symbol, Other");
      case G_UNICODE_LINE_SEPARATOR: return _("Separator, Line");
      case G_UNICODE_PARAGRAPH_SEPARATOR: return _("Separator, Paragraph");
      case G_UNICODE_SPACE_SEPARATOR: return _("Separator, Space");
168 169
      default: return "";
    }
170
}
171

Noah Levitt's avatar
Noah Levitt committed
172
/* does a binary search on unicode_names */
173
const gchar *
174
gucharmap_get_unicode_data_name (gunichar uc)
175 176 177
{
  gint min = 0;
  gint mid;
178
  gint max = G_N_ELEMENTS(unicode_names) - 1;
179

Noah Levitt's avatar
Noah Levitt committed
180
  if (uc < unicode_names[0].index || uc > unicode_names[max].index)
181 182 183 184 185
    return "";

  while (max >= min) 
    {
      mid = (min + max) / 2;
Noah Levitt's avatar
Noah Levitt committed
186
      if (uc > unicode_names[mid].index)
187
        min = mid + 1;
Noah Levitt's avatar
Noah Levitt committed
188
      else if (uc < unicode_names[mid].index)
189 190
        max = mid - 1;
      else
191
        return unicode_name_get_name(&unicode_names[mid]);
192 193 194 195 196
    }

  return NULL;
}

Noah Levitt's avatar
Noah Levitt committed
197
gint
198
gucharmap_get_unicode_data_name_count (void)
Noah Levitt's avatar
Noah Levitt committed
199 200 201 202
{
  return G_N_ELEMENTS (unicode_names);
}

203 204 205 206 207 208 209 210 211
/* Binary-searches the [start, end] ranges in unicode_versions for the one
 * containing @uc and returns that range's version, or
 * GUCHARMAP_UNICODE_VERSION_UNASSIGNED when no range contains @uc. */
GucharmapUnicodeVersion
gucharmap_get_unicode_version (gunichar uc)
{
  gint lo = 0;
  gint hi = G_N_ELEMENTS (unicode_versions) - 1;

  if (uc < unicode_versions[0].start || uc > unicode_versions[hi].end)
    return GUCHARMAP_UNICODE_VERSION_UNASSIGNED;

  while (lo <= hi)
    {
      gint probe = (lo + hi) / 2;

      if (uc > unicode_versions[probe].end)
        lo = probe + 1;
      else if (uc < unicode_versions[probe].start)
        hi = probe - 1;
      else
        /* neither above the end nor below the start: inside the range */
        return unicode_versions[probe].version;
    }

  return GUCHARMAP_UNICODE_VERSION_UNASSIGNED;
}

229
const gchar *
230 231 232 233 234
gucharmap_unicode_version_to_string (GucharmapUnicodeVersion version)
{
  g_return_val_if_fail (version >= GUCHARMAP_UNICODE_VERSION_UNASSIGNED &&
                        version <= GUCHARMAP_UNICODE_VERSION_LATEST, NULL);

235 236 237 238
  if (G_UNLIKELY (version == GUCHARMAP_UNICODE_VERSION_UNASSIGNED))
    return NULL;

  return unicode_version_strings + unicode_version_string_offsets[version - 1];
239 240
}

Noah Levitt's avatar
Noah Levitt committed
241
gint
242
gucharmap_get_unihan_count (void)
Noah Levitt's avatar
Noah Levitt committed
243 244 245 246
{
  return G_N_ELEMENTS (unihan);
}

247 248
/* does a binary search; also caches most recent, since it will often be
 * called in succession on the same character */
249
static const Unihan *
250 251 252 253 254 255
_get_unihan (gunichar uc)
{
  static gunichar most_recent_searched;
  static const Unihan *most_recent_result;
  gint min = 0;
  gint mid;
256 257
  gint max = G_N_ELEMENTS(unihan) - 1;

258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284

  if (uc < unihan[0].index || uc > unihan[max].index)
    return NULL;

  if (uc == most_recent_searched)
    return most_recent_result;

  most_recent_searched = uc;

  while (max >= min) 
    {
      mid = (min + max) / 2;
      if (uc > unihan[mid].index)
        min = mid + 1;
      else if (uc < unihan[mid].index)
        max = mid - 1;
      else
        {
          most_recent_result = unihan + mid;
          return unihan + mid;
        }
    }

  most_recent_result = NULL;
  return NULL;
}

285 286
/* does a binary search; also caches most recent, since it will often be
 * called in succession on the same character */
287
static const NamesList *
288 289 290 291 292 293
get_nameslist (gunichar uc)
{
  static gunichar most_recent_searched;
  static const NamesList *most_recent_result;
  gint min = 0;
  gint mid;
294
  gint max = G_N_ELEMENTS (names_list) - 1;
295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321

  if (uc < names_list[0].index || uc > names_list[max].index)
    return NULL;

  if (uc == most_recent_searched)
    return most_recent_result;

  most_recent_searched = uc;

  while (max >= min) 
    {
      mid = (min + max) / 2;
      if (uc > names_list[mid].index)
        min = mid + 1;
      else if (uc < names_list[mid].index)
        max = mid - 1;
      else
        {
          most_recent_result = names_list + mid;
          return names_list + mid;
        }
    }

  most_recent_result = NULL;
  return NULL;
}

322
G_GNUC_INTERNAL gboolean
Noah Levitt's avatar
Noah Levitt committed
323 324 325 326 327
_gucharmap_unicode_has_nameslist_entry (gunichar uc)
{
  return get_nameslist (uc) != NULL;
}

328 329
/* Collects the values of the consecutive names_list_exes entries that
 * belong to @uc.
 *
 * Returns a newly allocated array of gunichar terminated with
 * (gunichar)(-1), or NULL when @uc has no such entries. */
gunichar *
gucharmap_get_nameslist_exes (gunichar uc)
{
  const NamesList *entry = get_nameslist (uc);
  gunichar *result;
  gunichar n_exes, k;

  if (entry == NULL || entry->exes_index == -1)
    return NULL;

  /* the run for @uc ends at the first entry with a different index */
  n_exes = 0;
  while (names_list_exes[entry->exes_index + n_exes].index == uc)
    n_exes++;

  result = g_malloc ((n_exes + 1) * sizeof (gunichar));

  for (k = 0; k < n_exes; k++)
    result[k] = names_list_exes[entry->exes_index + k].value;

  result[n_exes] = (gunichar)(-1);

  return result;
}

353 354 355 356 357 358 359
/**
 * gucharmap_get_nameslist_equals:
 * @uc: a gunichar
 *
 * Returns: (transfer container): newly allocated null-terminated array of gchar*
 * the items are const, but the array should be freed by the caller
 */
360
const gchar **
361
gucharmap_get_nameslist_equals (gunichar uc)
362 363 364
{
  const NamesList *nl;
  const gchar **equals;
365 366
  gunichar i, count;

367 368 369 370 371 372 373 374 375 376 377
  nl = get_nameslist (uc);

  if (nl == NULL || nl->equals_index == -1)
    return NULL;

  /* count the number of equals */
  for (i = 0;  names_list_equals[nl->equals_index + i].index == uc;  i++);
  count = i;

  equals = g_malloc ((count + 1) * sizeof (gchar *));
  for (i = 0;  i < count;  i++)
378
    equals[i] = names_list_equals_strings + names_list_equals[nl->equals_index + i].string_index;
379 380 381 382 383
  equals[count] = NULL;

  return equals;
}

384 385 386 387 388 389 390
/**
 * gucharmap_get_nameslist_stars:
 * @uc: a #gunichar
 *
 * Returns: (transfer container): newly allocated null-terminated array of gchar*
 * the items are const, but the array should be freed by the caller
 */
391
const gchar **
392
gucharmap_get_nameslist_stars (gunichar uc)
393 394 395
{
  const NamesList *nl;
  const gchar **stars;
396
  gunichar i, count;
397 398 399 400 401 402 403 404 405 406 407 408

  nl = get_nameslist (uc);

  if (nl == NULL || nl->stars_index == -1)
    return NULL;

  /* count the number of stars */
  for (i = 0;  names_list_stars[nl->stars_index + i].index == uc;  i++);
  count = i;

  stars = g_malloc ((count + 1) * sizeof (gchar *));
  for (i = 0;  i < count;  i++)
409
    stars[i] = names_list_stars_strings + names_list_stars[nl->stars_index + i].string_index;
410 411 412 413 414
  stars[count] = NULL;

  return stars;
}

415 416 417 418 419 420 421
/**
 * gucharmap_get_nameslist_pounds:
 * @uc: a #gunichar
 *
 * Returns: (transfer container): newly allocated null-terminated array of gchar*
 * the items are const, but the array should be freed by the caller
 */
422
const gchar **
423
gucharmap_get_nameslist_pounds (gunichar uc)
424 425 426
{
  const NamesList *nl;
  const gchar **pounds;
427
  gunichar i, count;
428 429 430 431 432 433 434 435 436 437 438 439
  
  nl = get_nameslist (uc);

  if (nl == NULL || nl->pounds_index == -1)
    return NULL;

  /* count the number of pounds */
  for (i = 0;  names_list_pounds[nl->pounds_index + i].index == uc;  i++);
  count = i;

  pounds = g_malloc ((count + 1) * sizeof (gchar *));
  for (i = 0;  i < count;  i++)
440
    pounds[i] = names_list_pounds_strings + names_list_pounds[nl->pounds_index + i].string_index;
441 442 443 444 445
  pounds[count] = NULL;

  return pounds;
}

446 447 448 449 450 451 452
/**
 * gucharmap_get_nameslist_colons:
 * @uc: a #gunichar
 *
 * Returns: (transfer container): newly allocated null-terminated array of gchar*
 * the items are const, but the array should be freed by the caller
 */
453
const gchar **
454
gucharmap_get_nameslist_colons (gunichar uc)
455 456 457
{
  const NamesList *nl;
  const gchar **colons;
458 459
  gunichar i, count;

460 461 462 463 464 465 466 467 468 469 470
  nl = get_nameslist (uc);

  if (nl == NULL || nl->colons_index == -1)
    return NULL;

  /* count the number of colons */
  for (i = 0;  names_list_colons[nl->colons_index + i].index == uc;  i++);
  count = i;

  colons = g_malloc ((count + 1) * sizeof (gchar *));
  for (i = 0;  i < count;  i++)
471
    colons[i] = names_list_colons_strings + names_list_colons[nl->colons_index + i].string_index;
472 473 474 475 476
  colons[count] = NULL;

  return colons;
}

477
/* Wrapper, in case we want to support a newer unicode version than glib */
478
gboolean
479
gucharmap_unichar_validate (gunichar ch)
480
{
481
  return g_unichar_validate (ch);
482 483 484
}

/**
485
 * gucharmap_unichar_to_printable_utf8:
486 487 488 489 490 491 492 493 494 495 496 497
 * @uc: a unicode character 
 * @outbuf: output buffer, must have at least 10 bytes of space.
 *          If %NULL, the length will be computed and returned
 *          and nothing will be written to @outbuf.
 *
 * Converts a single character to UTF-8 suitable for rendering. Check the
 * source to see what this means. ;-)
 * 
 *
 * Return value: number of bytes written
 **/
gint
498
gucharmap_unichar_to_printable_utf8 (gunichar uc, gchar *outbuf)
499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514
{
  /* Unicode Standard 3.2, section 2.6, "By convention, diacritical marks
   * used by the Unicode Standard may be exhibited in (apparent) isolation
   * by applying them to U+0020 SPACE or to U+00A0 NO BREAK SPACE." */

  /* 17:10 < owen> noah: I'm *not* claiming that what Pango does currently
   *               is right, but convention isn't a requirement. I think
   *               it's probably better to do the Uniscribe thing and put
   *               the lone combining mark on a dummy character and require
   *               ZWJ
   * 17:11 < noah> owen: do you mean that i should put a ZWJ in there, or
   *               that pango will do that?
   * 17:11 < owen> noah: I mean, you should (assuming some future more
   *               capable version of Pango) put it in there
   */

515 516
  if (! gucharmap_unichar_validate (uc) || (! gucharmap_unichar_isgraph (uc) 
      && gucharmap_unichar_type (uc) != G_UNICODE_PRIVATE_USE))
517
    return 0;
518 519 520
  else if (gucharmap_unichar_type (uc) == G_UNICODE_COMBINING_MARK
      || gucharmap_unichar_type (uc) == G_UNICODE_ENCLOSING_MARK
      || gucharmap_unichar_type (uc) == G_UNICODE_NON_SPACING_MARK)
521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537
    {
      gint x;

      outbuf[0] = ' ';
      outbuf[1] = '\xe2'; /* ZERO */ 
      outbuf[2] = '\x80'; /* WIDTH */
      outbuf[3] = '\x8d'; /* JOINER (0x200D) */

      x = g_unichar_to_utf8 (uc, outbuf + 4);

      return x + 4;
    }
  else
    return g_unichar_to_utf8 (uc, outbuf);
}

/**
538
 * gucharmap_unichar_type:
539
 * @uc: a Unicode character
540 541 542 543 544 545
 * 
 * Classifies a Unicode character by type.
 * 
 * Return value: the type of the character.
 **/
GUnicodeType
546
gucharmap_unichar_type (gunichar uc)
547 548 549 550 551
{
  gint min = 0;
  gint mid;
  gint max = sizeof (unicode_categories) / sizeof (UnicodeCategory) - 1;

Noah Levitt's avatar
Noah Levitt committed
552
  if (uc < unicode_categories[0].start || uc > unicode_categories[max].end)
553 554 555 556 557
    return G_UNICODE_UNASSIGNED;

  while (max >= min) 
    {
      mid = (min + max) / 2;
Noah Levitt's avatar
Noah Levitt committed
558
      if (uc > unicode_categories[mid].end)
559
        min = mid + 1;
Noah Levitt's avatar
Noah Levitt committed
560
      else if (uc < unicode_categories[mid].start)
561 562 563 564 565 566 567 568 569
        max = mid - 1;
      else
        return unicode_categories[mid].category;
    }

  return G_UNICODE_UNASSIGNED;
}

/**
570
 * gucharmap_unichar_isdefined:
571 572 573 574 575 576 577 578
 * @uc: a Unicode character
 * 
 * Determines if a given character is assigned in the Unicode
 * standard.
 *
 * Return value: %TRUE if the character has an assigned value
 **/
gboolean
579
gucharmap_unichar_isdefined (gunichar uc)
580
{
581
  return gucharmap_unichar_type (uc) != G_UNICODE_UNASSIGNED;
582 583 584
}

/**
585
 * gucharmap_unichar_isgraph:
586 587 588 589 590 591 592 593 594 595 596
 * @uc: a Unicode character
 * 
 * Determines whether a character is printable and not a space
 * (returns %FALSE for control characters, format characters, and
 * spaces). g_unichar_isprint() is similar, but returns %TRUE for
 * spaces. Given some UTF-8 text, obtain a character value with
 * g_utf8_get_char().
 * 
 * Return value: %TRUE if @c is printable unless it's a space
 **/
gboolean
597
gucharmap_unichar_isgraph (gunichar uc)
598
{
599
  GUnicodeType t = gucharmap_unichar_type (uc);
600

601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617
  /* From http://www.unicode.org/versions/Unicode9.0.0/ch09.pdf, p16
   * "Unlike most other format control characters, however, they should be
   *  rendered with a visible glyph, even in circumstances where no suitable
   *  digit or sequence of digits follows them in logical order."
   * There the standard talks about the ar signs spanning numbers, but
   * I think this should apply to all Prepended_Concatenation_Mark format
   * characters.
   * Instead of parsing the corresponding data file, just hardcode the
   * (few!) existing characters here.
   */
  if (t == G_UNICODE_FORMAT)
    return (uc >= 0x0600 && uc <= 0x0605) || 
	   uc == 0x06DD ||
           uc == 0x070F ||
           uc == 0x08E2 ||
           uc == 0x110BD;

618 619 620 621 622 623
  return (t != G_UNICODE_CONTROL
          && t != G_UNICODE_UNASSIGNED
          && t != G_UNICODE_PRIVATE_USE
          && t != G_UNICODE_SURROGATE
          && t != G_UNICODE_SPACE_SEPARATOR);
}
624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661

/* Walks the UTF-8 string @str character by character and returns the first
 * one for which g_unichar_isalpha() is true.  Returns 0 when @str is NULL
 * or contains no such character. */
static gunichar
get_first_non_underscore_char (const char *str)
{
  const char *cursor = str;

  if (str == NULL)
    return 0;

  while (cursor != NULL && *cursor != '\0')
    {
      gunichar candidate = g_utf8_get_char (cursor);

      if (g_unichar_isalpha (candidate))
        return candidate;

      cursor = g_utf8_find_next_char (cursor, NULL);
    }

  return 0;
}

/**
 * gucharmap_unicode_get_locale_character:
 *
 * Determines a character that's commonly used in the current
 * locale's script.
 * 
 * Returns: a unicode character
 */
gunichar
gucharmap_unicode_get_locale_character (void)
{
  GtkStockItem item;
  if (!gtk_stock_lookup (GTK_STOCK_FIND, &item))
    return 0;

  return get_first_non_underscore_char (item.label);
}