Commit 65ff1324 authored by Graham Inggs's avatar Graham Inggs

Imported Upstream version 2.2.0

parent 07148425
cmake_minimum_required (VERSION 2.8)
cmake_minimum_required (VERSION 2.8.12)
include (utils.cmake)
......@@ -10,12 +10,8 @@ project (utf8proc C)
# API version number (defined in utf8proc.h).
# Be sure to also update these in Makefile and MANIFEST!
set(SO_MAJOR 2)
set(SO_MINOR 1)
set(SO_PATCH 1)
add_definitions (
-DUTF8PROC_EXPORTS
)
set(SO_MINOR 2)
set(SO_PATCH 0)
if (NOT MSVC)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -O2 -std=c99 -pedantic -Wall")
......@@ -26,8 +22,30 @@ add_library (utf8proc
utf8proc.h
)
if (BUILD_SHARED_LIBS)
# Building shared library
else()
# Building static library
target_compile_definitions(utf8proc PUBLIC "UTF8PROC_STATIC")
if (MSVC)
set_target_properties(utf8proc PROPERTIES OUTPUT_NAME "utf8proc_static")
endif()
endif()
target_compile_definitions(utf8proc PRIVATE "UTF8PROC_EXPORTS")
set_target_properties (utf8proc PROPERTIES
POSITION_INDEPENDENT_CODE ON
VERSION "${SO_MAJOR}.${SO_MINOR}.${SO_PATCH}"
SOVERSION ${SO_MAJOR}
)
install(TARGETS utf8proc
RUNTIME DESTINATION bin
LIBRARY DESTINATION lib
ARCHIVE DESTINATION lib)
install(
FILES
"${PROJECT_SOURCE_DIR}/utf8proc.h"
DESTINATION include)
......@@ -7,7 +7,7 @@ whose copyright and license statements are reproduced below, all new
work on the utf8proc library is licensed under the [MIT "expat"
license](http://opensource.org/licenses/MIT):
*Copyright © 2014-2015 by Steven G. Johnson, Jiahao Chen, Tony Kelman, Jonas Fonseca, and other contributors listed in the git history.*
*Copyright © 2014-2018 by Steven G. Johnson, Jiahao Chen, Tony Kelman, Jonas Fonseca, and other contributors listed in the git history.*
Permission is hereby granted, free of charge, to any person obtaining a
copy of this software and associated documentation files (the "Software"),
......
......@@ -2,6 +2,6 @@ include/
include/utf8proc.h
lib/
lib/libutf8proc.a
lib/libutf8proc.so -> libutf8proc.so.2.1.1
lib/libutf8proc.so.2 -> libutf8proc.so.2.1.1
lib/libutf8proc.so.2.1.1
lib/libutf8proc.so -> libutf8proc.so.2.2.0
lib/libutf8proc.so.2 -> libutf8proc.so.2.2.0
lib/libutf8proc.so.2.2.0
......@@ -11,7 +11,7 @@ CFLAGS ?= -O2
PICFLAG = -fPIC
C99FLAG = -std=c99
WCFLAGS = -Wall -pedantic
UCFLAGS = $(CFLAGS) $(PICFLAG) $(C99FLAG) $(WCFLAGS) -DUTF8PROC_EXPORTS
UCFLAGS = $(CFLAGS) $(PICFLAG) $(C99FLAG) $(WCFLAGS) -DUTF8PROC_EXPORTS $(UTF8PROC_DEFINES)
# shared-library version MAJOR.MINOR.PATCH ... this may be *different*
# from the utf8proc version number because it indicates ABI compatibility,
......@@ -20,8 +20,8 @@ UCFLAGS = $(CFLAGS) $(PICFLAG) $(C99FLAG) $(WCFLAGS) -DUTF8PROC_EXPORTS
# The API version number is defined in utf8proc.h.
# Be sure to also update these ABI versions in MANIFEST and CMakeLists.txt!
MAJOR=2
MINOR=1
PATCH=1
MINOR=2
PATCH=0
OS := $(shell uname)
ifeq ($(OS),Darwin) # MacOS X
......
# utf8proc release history #
## Version 2.2 ##
2018-07-24
- Unicode 11 support ([#132] and [#140]).
- `utf8proc_NFKC_Casefold` convenience function for `NFKC_Casefold`
normalization ([#133]).
- `UTF8PROC_STRIPNA` option to strip unassigned codepoints ([#133]).
- Support building static libraries on Windows (callers need to
`#define UTF8PROC_STATIC`) ([#123]).
- `cmake` fix to avoid defining `UTF8PROC_EXPORTS` globally ([#121]).
- `toupper` of ß (U+00df) now yields ẞ (U+1E9E) ([#134]), similar to musl;
case-folding still yields the standard "ss" mapping.
- `utf8proc_charwidth` now returns `1` for U+00AD (soft hyphen) and
for unassigned/PUA codepoints ([#135]).
## Version 2.1.1 ##
2018-04-27
......@@ -312,5 +334,12 @@ Release of version 1.0.1
[#94]: https://github.com/JuliaLang/utf8proc/issues/94
[#99]: https://github.com/JuliaLang/utf8proc/issues/99
[#113]: https://github.com/JuliaLang/utf8proc/issues/113
[#121]: https://github.com/JuliaLang/utf8proc/issues/121
[#123]: https://github.com/JuliaLang/utf8proc/issues/123
[#125]: https://github.com/JuliaLang/utf8proc/issues/125
[#128]: https://github.com/JuliaLang/utf8proc/issues/128
[#132]: https://github.com/JuliaLang/utf8proc/issues/132
[#133]: https://github.com/JuliaLang/utf8proc/issues/133
[#134]: https://github.com/JuliaLang/utf8proc/issues/134
[#135]: https://github.com/JuliaLang/utf8proc/issues/135
[#140]: https://github.com/JuliaLang/utf8proc/issues/140
# utf8proc
[![Travis CI Status](https://travis-ci.org/JuliaLang/utf8proc.png)](https://travis-ci.org/JuliaLang/utf8proc)
[![AppVeyor Status](https://ci.appveyor.com/api/projects/status/aou20lfkyhj8xbwq/branch/master?svg=true)](https://ci.appveyor.com/project/tkelman/utf8proc/branch/master)
[![Travis CI Status](https://travis-ci.org/JuliaStrings/utf8proc.png)](https://travis-ci.org/JuliaStrings/utf8proc)
[![AppVeyor status](https://ci.appveyor.com/api/projects/status/ivaa0v6ikxrmm5r6?svg=true)](https://ci.appveyor.com/project/StevenGJohnson/utf8proc)
[utf8proc](http://julialang.org/utf8proc/) is a small, clean C
library that provides Unicode normalization, case-folding, and other
......@@ -40,7 +39,7 @@ The C library is found in this directory after successful compilation
and is named `libutf8proc.a` (for the static library) and
`libutf8proc.so` (for the dynamic library).
The Unicode version supported is 9.0.0.
The Unicode version supported is 11.0.0.
For Unicode normalizations, the following options are used:
......
......@@ -16,11 +16,11 @@ CURLFLAGS = --retry 5 --location
.DELETE_ON_ERROR:
utf8proc_data.c.new: data_generator.rb UnicodeData.txt GraphemeBreakProperty.txt DerivedCoreProperties.txt CompositionExclusions.txt CaseFolding.txt CharWidths.txt
utf8proc_data.c.new: data_generator.rb UnicodeData.txt GraphemeBreakProperty.txt DerivedCoreProperties.txt CompositionExclusions.txt CaseFolding.txt CharWidths.txt emoji-data.txt
$(RUBY) data_generator.rb < UnicodeData.txt > $@
# GNU Unifont version for font metric calculations:
UNIFONT_VERSION=9.0.04
UNIFONT_VERSION=11.0.01
unifont.ttf:
$(CURL) $(CURLFLAGS) -o $@ $(URLCACHE)https://mirrors.kernel.org/gnu/unifont/unifont-$(UNIFONT_VERSION)/unifont-$(UNIFONT_VERSION).ttf
......@@ -35,7 +35,7 @@ CharWidths.txt: charwidths.jl unifont.sfd unifont_upper.sfd EastAsianWidth.txt
$(JULIA) charwidths.jl > $@
# Unicode data version
UNICODE_VERSION=9.0.0
UNICODE_VERSION=11.0.0
UnicodeData.txt:
$(CURL) $(CURLFLAGS) -o $@ -O http://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/UnicodeData.txt
......@@ -61,6 +61,9 @@ NormalizationTest.txt:
GraphemeBreakTest.txt:
$(CURL) $(CURLFLAGS) $(URLCACHE)http://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/auxiliary/GraphemeBreakTest.txt | $(PERL) -pe 's,÷,/,g;s,×,+,g' > $@
emoji-data.txt:
$(CURL) $(CURLFLAGS) -o $@ -O $(URLCACHE)http://unicode.org/Public/emoji/`echo $(UNICODE_VERSION) | cut -d. -f1-2`/emoji-data.txt
clean:
rm -f UnicodeData.txt EastAsianWidth.txt GraphemeBreakProperty.txt DerivedCoreProperties.txt CompositionExclusions.txt CaseFolding.txt NormalizationTest.txt GraphemeBreakTest.txt CharWidths.txt unifont*.ttf unifont*.sfd
rm -f UnicodeData.txt EastAsianWidth.txt GraphemeBreakProperty.txt DerivedCoreProperties.txt CompositionExclusions.txt CaseFolding.txt NormalizationTest.txt GraphemeBreakTest.txt CharWidths.txt unifont*.ttf unifont*.sfd emoji-data.txt
rm -f utf8proc_data.c.new
......@@ -7,17 +7,6 @@
# Requires Julia (obviously) and FontForge.
#############################################################################
# Julia 0.3/0.4 compatibility (taken from Compat package)
if VERSION < v"0.4.0-dev+1387"
typealias AbstractString String
end
if VERSION < v"0.4.0-dev+1419"
const UInt32 = Uint32
end
if VERSION < v"0.4.0-dev+3874"
Base.parse{T<:Integer}(::Type{T}, s::AbstractString) = parseint(T, s)
end
CharWidths = Dict{Int,Int}()
#############################################################################
......@@ -31,12 +20,12 @@ import Base.UTF8proc
#############################################################################
# Use a default width of 1 for all character categories that are
# letter/symbol/number-like. This can be overriden by Unifont or UAX 11
# letter/symbol/number-like, as well as for unassigned/private-use chars.
# This can be overriden by Unifont or UAX 11
# below, but provides a useful nonzero fallback for new codepoints when
# a new Unicode version has been released but Unifont hasn't been updated yet.
zerowidth = Set{Int}() # categories that may contain zero-width chars
push!(zerowidth, UTF8proc.UTF8PROC_CATEGORY_CN)
push!(zerowidth, UTF8proc.UTF8PROC_CATEGORY_MN)
push!(zerowidth, UTF8proc.UTF8PROC_CATEGORY_MC)
push!(zerowidth, UTF8proc.UTF8PROC_CATEGORY_ME)
......@@ -47,7 +36,6 @@ push!(zerowidth, UTF8proc.UTF8PROC_CATEGORY_ZP)
push!(zerowidth, UTF8proc.UTF8PROC_CATEGORY_CC)
push!(zerowidth, UTF8proc.UTF8PROC_CATEGORY_CF)
push!(zerowidth, UTF8proc.UTF8PROC_CATEGORY_CS)
push!(zerowidth, UTF8proc.UTF8PROC_CATEGORY_CO)
for c in 0x0000:0x110000
if catcode(c) zerowidth
CharWidths[c] = 1
......@@ -97,7 +85,7 @@ CharWidths=parsesfd("unifont_upper.sfd", CharWidths)
for line in readlines(open("EastAsianWidth.txt"))
#Strip comments
line[1] == '#' && continue
(isempty(line) || line[1] == '#') && continue
precomment = split(line, '#')[1]
#Parse code point range and width code
tokens = split(precomment, ';')
......@@ -113,7 +101,7 @@ for line in readlines(open("EastAsianWidth.txt"))
for c in charstart:charend
if width=="W" || width=="F" # wide or full
CharWidths[c]=2
elseif width=="Na"|| width=="H" # narrow or half
elseif width=="Na"|| width=="H"
CharWidths[c]=1
end
end
......@@ -126,9 +114,11 @@ end
for c in keys(CharWidths)
cat = catcode(c)
# make sure format control character (category Cf) have width 0,
# except for the Arabic characters 0x06xx (see unicode std 6.2, sec. 8.2)
if cat==UTF8proc.UTF8PROC_CATEGORY_CF && c [0x0601,0x0602,0x0603,0x06dd]
# make sure format control character (category Cf) have width 0
# (some of these, like U+0601, can have a width in some cases
# but normally act like prepended combining marks. U+fff9 etc
# are also odd, but have zero width in typical terminal contexts)
if cat==UTF8proc.UTF8PROC_CATEGORY_CF
CharWidths[c]=0
end
......@@ -139,11 +129,12 @@ for c in keys(CharWidths)
CharWidths[c]=0
end
# We also assign width of zero to unassigned and private-use
# We also assign width of one to unassigned and private-use
# codepoints (Unifont includes ConScript Unicode Registry PUA fonts,
# but since these are nonstandard it seems questionable to recognize them).
# but since these are nonstandard it seems questionable to use Unifont metrics;
# if they are printed as the replacement character U+FFFD they will have width 1).
if cat==UTF8proc.UTF8PROC_CATEGORY_CO || cat==UTF8proc.UTF8PROC_CATEGORY_CN
CharWidths[c]=0
CharWidths[c]=1
end
# for some reason, Unifont has width-2 glyphs for ASCII control chars
......@@ -152,6 +143,9 @@ for c in keys(CharWidths)
end
end
#Soft hyphen is typically printed as a hyphen (-) in terminals.
CharWidths[0x00ad]=1
#By definition, should have zero width (on the same line)
#0x002028 '
' category: Zl name: LINE SEPARATOR/
#0x002029 '
' category: Zp name: PARAGRAPH SEPARATOR/
......@@ -169,8 +163,8 @@ CharWidths[0x2001]=2
CharWidths[0x2003]=2
#############################################################################
# Output (to a file or pipe) for processing by data_generator.rb
# ... don't bother to output zero widths since that will be the default.
# Output (to a file or pipe) for processing by data_generator.rb,
# encoded as a sequence of intervals.
firstc = 0x000000
lastv = 0
......
......@@ -6,6 +6,8 @@
# production use.
# Copyright (c) 2018 Steven G. Johnson, Tony Kelman, Keno Fischer,
# Benito van der Zander, Michaël Meyer, and other contributors.
# Copyright (c) 2009 Public Software Group e. V., Berlin, Germany
#
# Permission is hereby granted, free of charge, to any person obtaining a
......@@ -85,6 +87,19 @@ $grapheme_boundclass_list.each_line do |entry|
end
end
$emoji_data_list = File.read("emoji-data.txt")
$emoji_data_list.each_line do |entry|
if entry =~ /^([0-9A-F]+)\.\.([0-9A-F]+)\s*;\s*Extended_Pictographic\W/
$1.hex.upto($2.hex) { |e2| $grapheme_boundclass[e2] = "UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC" }
elsif entry =~ /^([0-9A-F]+)\s*;\s*Extended_Pictographic\W/
$grapheme_boundclass[$1.hex] = "UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC"
elsif entry =~ /^([0-9A-F]+)\.\.([0-9A-F]+)\s*;\s*Emoji_Modifier\W/
$1.hex.upto($2.hex) { |e2| $grapheme_boundclass[e2] = "UTF8PROC_BOUNDCLASS_EXTEND" }
elsif entry =~ /^([0-9A-F]+)\s*;\s*Emoji_Modifier\W/
$grapheme_boundclass[$1.hex] = "UTF8PROC_BOUNDCLASS_EXTEND"
end
end
$charwidth_list = File.read("CharWidths.txt")
$charwidth = Hash.new(0)
$charwidth_list.each_line do |entry|
......@@ -104,7 +119,7 @@ $excl_version = $excl_version.chomp.split("\n").collect { |e| e.hex }
$case_folding_string = File.open("CaseFolding.txt", :encoding => 'utf-8').read
$case_folding = {}
$case_folding_string.chomp.split("\n").each do |line|
next unless line =~ /([0-9A-F]+); [CFS]; ([0-9A-F ]+);/i
next unless line =~ /([0-9A-F]+); [CF]; ([0-9A-F ]+);/i
$case_folding[$1.hex] = $2.split(" ").collect { |e| e.hex }
end
......@@ -137,13 +152,13 @@ def cpary2utf16encoded(array)
end
def cpary2c(array)
return "UINT16_MAX" if array.nil? || array.length == 0
lencode = array.length - 1 #no sequence has len 0, so we encode len 1 as 0, len 2 as 1, ...
lencode = array.length - 1 #no sequence has len 0, so we encode len 1 as 0, len 2 as 1, ...
array = cpary2utf16encoded(array)
if lencode >= 7 #we have only 3 bits for the length (which is already cutting it close. might need to change it to 2 bits in future Unicode versions)
array = [lencode] + array
array = [lencode] + array
lencode = 7
end
idx = pushary(array)
end
idx = pushary(array)
raise "Array index out of bound" if idx > 0x1FFF
return "#{idx | (lencode << 13)}"
end
......@@ -188,9 +203,10 @@ class UnicodeChar
@decomp_mapping = ($8=='') ? nil :
$8.split.collect { |element| element.hex }
@bidi_mirrored = ($13=='Y') ? true : false
@uppercase_mapping = ($16=='') ? nil : $16.hex
# issue #130: use nonstandard uppercase ß -> ẞ
@uppercase_mapping = ($16=='') ? (code==0x00df ? 0x1e9e : nil) : $16.hex
@lowercase_mapping = ($17=='') ? nil : $17.hex
@titlecase_mapping = ($18=='') ? nil : $18.hex
@titlecase_mapping = ($18=='') ? (code==0x00df ? 0x1e9e : nil) : $18.hex
end
def case_folding
$case_folding[code]
......@@ -260,17 +276,17 @@ chars.each do |char|
end
unless comb2nd_indicies[dm1]
comb2nd_indicies_sorted_keys << dm1
comb2nd_indicies[dm1] = comb2nd_indicies.keys.length
comb2nd_indicies[dm1] = comb2nd_indicies.keys.length
end
comb_array[comb1st_indicies[dm0]] ||= []
raise "Duplicate canonical mapping: #{char.code} #{dm0} #{dm1}" if comb_array[comb1st_indicies[dm0]][comb2nd_indicies[dm1]]
comb_array[comb1st_indicies[dm0]][comb2nd_indicies[dm1]] = char.code
comb2nd_indicies_nonbasic[dm1] = true if char.code > 0xFFFF
end
char.c_decomp_mapping = cpary2c(char.decomp_mapping)
char.c_case_folding = cpary2c(char.case_folding)
end
end
comb_indicies = {}
cumoffset = 0
......@@ -281,7 +297,7 @@ comb1st_indicies.each do |dm0, index|
last = nil
offset = 0
comb2nd_indicies_sorted_keys.each_with_index do |dm1, b|
if comb_array[index][b]
if comb_array[index][b]
first = offset unless first
last = offset
last += 1 if comb2nd_indicies_nonbasic[dm1]
......@@ -377,7 +393,7 @@ end
$stdout << "};\n\n"
$stdout << "static const utf8proc_property_t utf8proc_properties[] = {\n"
$stdout << " {0, 0, 0, 0, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, false,false,false,false, 0, 0, UTF8PROC_BOUNDCLASS_OTHER},\n"
$stdout << " {0, 0, 0, 0, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, false,false,false,false, 1, 0, UTF8PROC_BOUNDCLASS_OTHER},\n"
properties.each { |line|
$stdout << line
}
......@@ -391,7 +407,7 @@ comb1st_indicies.keys.each_index do |a|
offset = 0
$stdout << comb1st_indicies_firstoffsets[a] << ", " << comb1st_indicies_lastoffsets[a] << ", "
comb2nd_indicies_sorted_keys.each_with_index do |dm1, b|
break if offset > comb1st_indicies_lastoffsets[a]
break if offset > comb1st_indicies_lastoffsets[a]
if offset >= comb1st_indicies_firstoffsets[a]
i += 1
if i == 8
......@@ -403,9 +419,8 @@ comb1st_indicies.keys.each_index do |a|
$stdout << (v & 0xFFFF) << ", "
end
offset += 1
offset += 1 if comb2nd_indicies_nonbasic[dm1]
offset += 1 if comb2nd_indicies_nonbasic[dm1]
end
$stdout << "\n"
end
$stdout << "};\n\n"
......@@ -13,13 +13,22 @@ int main(int argc, char **argv)
for (c = 0; c <= 0x110000; ++c) {
utf8proc_int32_t l = utf8proc_tolower(c);
utf8proc_int32_t u = utf8proc_toupper(c);
utf8proc_int32_t t = utf8proc_totitle(c);
check(l == c || utf8proc_codepoint_valid(l), "invalid tolower");
check(u == c || utf8proc_codepoint_valid(u), "invalid toupper");
check(t == c || utf8proc_codepoint_valid(t), "invalid totitle");
if (utf8proc_codepoint_valid(c) && (l == u) != (l == t) &&
/* Unicode 11: Georgian Mkhedruli chars have uppercase but no titlecase. */
!(((c >= 0x10d0 && c <= 0x10fa) || c >= (0x10fd && c <= 0x10ff)) && l != u)) {
fprintf(stderr, "unexpected titlecase %x for lowercase %x / uppercase %x\n", t, l, c);
++error;
}
if (sizeof(wint_t) > 2 || c < (1<<16)) {
wint_t l0 = towlower(c), u0 = towupper(c);
/* OS unicode tables may be out of date. But if they
do have a lower/uppercase mapping, hopefully it
is correct? */
......@@ -44,6 +53,20 @@ int main(int argc, char **argv)
}
}
check(!error, "utf8proc case conversion FAILED %d tests.", error);
/* issue #130 */
check(utf8proc_toupper(0x00df) == 0x1e9e &&
utf8proc_totitle(0x00df) == 0x1e9e &&
utf8proc_tolower(0x00df) == 0x00df &&
utf8proc_tolower(0x1e9e) == 0x00df &&
utf8proc_toupper(0x1e9e) == 0x1e9e,
"incorrect 0x00df/0x1e9e case conversions");
utf8proc_uint8_t str_00df[] = {0xc3, 0x9f, 0x00};
utf8proc_uint8_t str_1e9e[] = {0xe1, 0xba, 0x9e, 0x00};
check(!strcmp((char*)utf8proc_NFKC_Casefold(str_00df), "ss") &&
!strcmp((char*)utf8proc_NFKC_Casefold(str_1e9e), "ss"),
"incorrect 0x00df/0x1e9e casefold normalization");
printf("More up-to-date than OS unicode tables for %d tests.\n", better);
printf("utf8proc case conversion tests SUCCEEDED.\n");
return 0;
......
......@@ -2,70 +2,76 @@
#include <ctype.h>
#include <wchar.h>
static int my_unassigned(int c) {
int cat = utf8proc_get_property(c)->category;
return (cat == UTF8PROC_CATEGORY_CN) || (cat == UTF8PROC_CATEGORY_CO);
}
static int my_isprint(int c) {
int cat = utf8proc_get_property(c)->category;
return (UTF8PROC_CATEGORY_LU <= cat && cat <= UTF8PROC_CATEGORY_ZS) ||
(c == 0x0601 || c == 0x0602 || c == 0x0603 || c == 0x06dd);
int cat = utf8proc_get_property(c)->category;
return (UTF8PROC_CATEGORY_LU <= cat && cat <= UTF8PROC_CATEGORY_ZS) ||
(c == 0x0601 || c == 0x0602 || c == 0x0603 || c == 0x06dd || c == 0x00ad) ||
(cat == UTF8PROC_CATEGORY_CN) || (cat == UTF8PROC_CATEGORY_CO);
}
int main(int argc, char **argv)
{
int c, error = 0, updates = 0;
int c, error = 0, updates = 0;
(void) argc; /* unused */
(void) argv; /* unused */
(void) argc; /* unused */
(void) argv; /* unused */
/* some simple sanity tests of the character widths */
for (c = 0; c <= 0x110000; ++c) {
int cat = utf8proc_get_property(c)->category;
int w = utf8proc_charwidth(c);
if ((cat == UTF8PROC_CATEGORY_MN || cat == UTF8PROC_CATEGORY_ME) && w > 0) {
fprintf(stderr, "nonzero width %d for combining char %x\n", w, c);
error += 1;
}
if (w == 0 &&
((cat >= UTF8PROC_CATEGORY_LU && cat <= UTF8PROC_CATEGORY_LO) ||
(cat >= UTF8PROC_CATEGORY_ND && cat <= UTF8PROC_CATEGORY_SC) ||
(cat >= UTF8PROC_CATEGORY_SO && cat <= UTF8PROC_CATEGORY_ZS))) {
fprintf(stderr, "zero width for symbol-like char %x\n", c);
error += 1;
}
if (c <= 127 && ((!isprint(c) && w > 0) || (isprint(c) && wcwidth(c) != w))) {
fprintf(stderr, "wcwidth %d mismatch %d for %s ASCII %x\n",
wcwidth(c), w,
isprint(c) ? "printable" : "non-printable", c);
error += 1;
}
if (!my_isprint(c) && w > 0) {
fprintf(stderr, "non-printing %x had width %d\n", c, w);
error += 1;
}
if (my_unassigned(c) && w != 1) {
fprintf(stderr, "unexpected width %d for unassigned char %x\n", w, c);
error += 1;
}
}
check(!error, "utf8proc_charwidth FAILED %d tests.", error);
/* some simple sanity tests of the character widths */
for (c = 0; c <= 0x110000; ++c) {
int cat = utf8proc_get_property(c)->category;
int w = utf8proc_charwidth(c);
if ((cat == UTF8PROC_CATEGORY_MN || cat == UTF8PROC_CATEGORY_ME) &&
w > 0) {
fprintf(stderr, "nonzero width %d for combining char %x\n", w, c);
error = 1;
}
if (w == 0 &&
((cat >= UTF8PROC_CATEGORY_LU && cat <= UTF8PROC_CATEGORY_LO) ||
(cat >= UTF8PROC_CATEGORY_ND && cat <= UTF8PROC_CATEGORY_SC) ||
(cat >= UTF8PROC_CATEGORY_SO && cat <= UTF8PROC_CATEGORY_ZS))) {
fprintf(stderr, "zero width for symbol-like char %x\n", c);
error = 1;
}
if (c <= 127 && ((!isprint(c) && w > 0) ||
(isprint(c) && wcwidth(c) != w))) {
fprintf(stderr, "wcwidth %d mismatch %d for %s ASCII %x\n",
wcwidth(c), w,
isprint(c) ? "printable" : "non-printable", c);
error = 1;
}
if (!my_isprint(c) && w > 0) {
fprintf(stderr, "non-printing %x had width %d\n", c, w);
error = 1;
}
}
check(!error, "utf8proc_charwidth FAILED tests.");
check(utf8proc_charwidth(0x00ad) == 1, "incorrect width for U+00AD (soft hyphen)");
check(utf8proc_charwidth(0xe000) == 1, "incorrect width for U+e000 (PUA)");
/* print some other information by compariing with system wcwidth */
printf("Mismatches with system wcwidth (not necessarily errors):\n");
for (c = 0; c <= 0x110000; ++c) {
int w = utf8proc_charwidth(c);
int wc = wcwidth(c);
if (sizeof(wchar_t) == 2 && c >= (1<<16)) continue;
/* lots of these errors for out-of-date system unicode tables */
if (wc == -1 && my_isprint(c) && w > 0) {
updates += 1;
#if 0
printf(" wcwidth(%x) = -1 for printable char\n", c);
#endif
}
if (wc == -1 && !my_isprint(c) && w > 0)
printf(" wcwidth(%x) = -1 for non-printable width-%d char\n", c, w);
if (wc >= 0 && wc != w)
printf(" wcwidth(%x) = %d != charwidth %d\n", c, wc, w);
}
printf(" ... (positive widths for %d chars unknown to wcwidth) ...\n",
updates);
printf("Character-width tests SUCCEEDED.\n");
/* print some other information by compariing with system wcwidth */
printf("Mismatches with system wcwidth (not necessarily errors):\n");
for (c = 0; c <= 0x110000; ++c) {
int w = utf8proc_charwidth(c);
int wc = wcwidth(c);
if (sizeof(wchar_t) == 2 && c >= (1<<16)) continue;
/* lots of these errors for out-of-date system unicode tables */
if (wc == -1 && my_isprint(c) && !my_unassigned(c) && w > 0)
updates += 1;
if (wc == -1 && !my_isprint(c) && w > 0)
printf(" wcwidth(%x) = -1 for non-printable width-%d char\n", c, w);
if (wc >= 0 && wc != w)
printf(" wcwidth(%x) = %d != charwidth %d\n", c, wc, w);
}
printf(" ... (positive widths for %d chars unknown to wcwidth) ...\n", updates);
printf("Character-width tests SUCCEEDED.\n");
return 0;
return 0;
}
......@@ -23,5 +23,6 @@ int main(void)
check(strlen((char*) output) == 6, "incorrect output length");
check(!memcmp(correct, output, 7), "incorrect output data");
free(output);
printf("map_custom tests SUCCEEDED.\n");
return 0;
}
......@@ -19,9 +19,28 @@ static void issue128(void) /* #128 */
free(nfd_out); free(nfc_out);
}
static void issue102(void) /* #128 */
{
utf8proc_uint8_t input[] = {0x58, 0xe2, 0x81, 0xa5, 0x45, 0xcc, 0x80, 0xc2, 0xad, 0xe1, 0xb4, 0xac, 0x00}; /* "X\u2065E\u0300\u00ad\u1d2c" */
utf8proc_uint8_t stripna[] = {0x78, 0xc3, 0xa8, 0x61, 0x00}; /* "x\u00e8a" */
utf8proc_uint8_t correct[] = {0x78, 0xe2, 0x81, 0xa5, 0xc3, 0xa8, 0x61, 0x00}; /* "x\u2065\u00e8a" */
utf8proc_uint8_t *output;
utf8proc_map(input, 0, &output, UTF8PROC_NULLTERM | UTF8PROC_STABLE |
UTF8PROC_COMPOSE | UTF8PROC_COMPAT | UTF8PROC_CASEFOLD | UTF8PROC_IGNORE | UTF8PROC_STRIPNA);
printf("NFKC_Casefold \"%s\" -> \"%s\" vs. \"%s\"\n", (char*)input, (char*)output, (char*)stripna);
check(strlen((char*) output) == 4, "incorrect NFKC_Casefold+stripna length");
check(!memcmp(stripna, output, 5), "incorrect NFKC_Casefold+stripna data");
free(output);
output = utf8proc_NFKC_Casefold(input);
printf("NFKC_Casefold \"%s\" -> \"%s\" vs. \"%s\"\n", (char*)input, (char*)output, (char*)correct);
check(strlen((char*) output) == 7, "incorrect NFKC_Casefold length");
check(!memcmp(correct, output, 8), "incorrect NFKC_Casefold data");
}
int main(void)
{
issue128();
issue102();
printf("Misc tests SUCCEEDED.\n");
return 0;
}
......@@ -4,46 +4,57 @@
int main(int argc, char **argv)
{
int i;
int i;
for (i = 1; i < argc; ++i) {
unsigned int c;
if (!strcmp(argv[i], "-V")) {
printf("utf8proc version %s\n", utf8proc_version());
continue;
}
check(sscanf(argv[i],"%x",&c) == 1, "invalid hex input %s", argv[i]);
const utf8proc_property_t *p = utf8proc_get_property(c);
printf("U+%s:\n"
" category = %s\n"
" combining_class = %d\n"
" bidi_class = %d\n"
" decomp_type = %d\n"
" uppercase_mapping = %x\n"
" lowercase_mapping = %x\n"
" titlecase_mapping = %x\n"
" comb_index = %d\n"
" bidi_mirrored = %d\n"
" comp_exclusion = %d\n"
" ignorable = %d\n"
" control_boundary = %d\n"
" boundclass = %d\n"
" charwidth = %d\n",
argv[i],
utf8proc_category_string(c),
p->combining_class,
p->bidi_class,
p->decomp_type,
utf8proc_toupper(c),
utf8proc_tolower(c),
utf8proc_totitle(c),
p->comb_index,
p->bidi_mirrored,
p->comp_exclusion,
p->ignorable,
p->control_boundary,
p->boundclass,
utf8proc_charwidth(c));
}
return 0;
for (i = 1; i < argc; ++i) {
utf8proc_uint8_t cstr[16], *map;
unsigned int c;
if (!strcmp(argv[i], "-V")) {
printf("utf8proc version %s\n", utf8proc_version());
continue;
}
check(sscanf(argv[i],"%x",&c) == 1, "invalid hex input %s", argv[i]);
const utf8proc_property_t *p = utf8proc_get_property(c);
if (utf8proc_codepoint_valid(c))
cstr[utf8proc_encode_char(c, cstr)] = 0;
else
strcat((char*)cstr, "N/A");
utf8proc_map(cstr, 0, &map, UTF8PROC_NULLTERM | UTF8PROC_CASEFOLD);
printf("U+%s: %s\n"
" category = %s\n"
" combining_class = %d\n"
" bidi_class = %d\n"
" decomp_type = %d\n"
" uppercase_mapping = %x\n"
" lowercase_mapping = %x\n"
" titlecase_mapping = %x\n"
" casefold = %s\n"
" comb_index = %d\n"
" bidi_mirrored = %d\n"
" comp_exclusion = %d\n"
" ignorable = %d\n"
" control_boundary = %d\n"
" boundclass = %d\n"
" charwidth = %d\n",
argv[i], (char*) cstr,
utf8proc_category_string(c),
p->combining_class,
p->bidi_class,
p->decomp_type,
utf8proc_toupper(c),
utf8proc_tolower(c),
utf8proc_totitle(c),
(char *) map,
p->comb_index,
p->bidi_mirrored,
p->comp_exclusion,
p->ignorable,
p->control_boundary,
p->boundclass,
utf8proc_charwidth(c));
free(map);
}
return 0;
}
/* -*- mode: c; c-basic-offset: 2; tab-width: 2; indent-tabs-mode: nil -*- */
/*
* Copyright (c) 2015 Steven G. Johnson, Jiahao Chen, Peter Colberg, Tony Kelman, Scott P. Jones, and other contributors.
* Copyright (c) 2018 Steven G. Johnson, Jiahao Chen, Peter Colberg, Tony Kelman, Scott P. Jones, and other contributors.
* Copyright (c) 2009 Public Software Group e. V., Berlin, Germany