Commit b0e22197 authored by Mo Zhou's avatar Mo Zhou

New upstream version 2.3.0

parent 65ff1324
language: c
compiler:
- gcc
- clang
language: julia
julia:
- 1.1
notifications:
email: false
include:
- language: julia
julia: 1.1
before_install:
- sudo add-apt-repository ppa:staticfloat/julia-deps -y
- sudo add-apt-repository ppa:staticfloat/juliareleases -y
- sudo apt-get update -qq -y
- sudo apt-get install libpcre3-dev julia fontforge -y
- sudo apt-get install fontforge -y
script:
- make manifest && diff MANIFEST.new MANIFEST
- make check
......@@ -16,7 +15,3 @@ script:
- make clean && git status --ignored --porcelain && test -z "$(git status --ignored --porcelain)"
- (mkdir build_static && cd build_static && cmake .. -DCMAKE_VERBOSE_MAKEFILE=ON && make)
- (mkdir build_shared && cd build_shared && cmake .. -DCMAKE_VERBOSE_MAKEFILE=ON -DBUILD_SHARED_LIBS=ON && make)
env:
# use JuliaLang caching (https://github.com/staticfloat/cache.julialang.org)
# so that Travis builds do not depend on anyone's flaky servers but our own
- URLCACHE=https://cache.julialang.org/ CFLAGS="-O2 -Werror -Wmissing-prototypes"
......@@ -13,15 +13,14 @@ set(SO_MAJOR 2)
set(SO_MINOR 2)
set(SO_PATCH 0)
if (NOT MSVC)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -O2 -std=c99 -pedantic -Wall")
endif ()
add_library (utf8proc
utf8proc.c
utf8proc.h
)
# expose header path, for when this is part of a larger cmake project
target_include_directories(utf8proc PUBLIC ../utf8proc)
if (BUILD_SHARED_LIBS)
# Building shared library
else()
......@@ -34,6 +33,13 @@ endif()
target_compile_definitions(utf8proc PRIVATE "UTF8PROC_EXPORTS")
if (NOT MSVC)
set_target_properties(
utf8proc PROPERTIES
COMPILE_FLAGS "-O2 -std=c99 -pedantic -Wall"
)
endif ()
set_target_properties (utf8proc PROPERTIES
POSITION_INDEPENDENT_CODE ON
VERSION "${SO_MAJOR}.${SO_MINOR}.${SO_PATCH}"
......
......@@ -5,3 +5,5 @@ lib/libutf8proc.a
lib/libutf8proc.so -> libutf8proc.so.2.2.0
lib/libutf8proc.so.2 -> libutf8proc.so.2.2.0
lib/libutf8proc.so.2.2.0
lib/pkgconfig/
lib/pkgconfig/libutf8proc.pc
......@@ -5,13 +5,14 @@ AR?=ar
CC?=gcc
INSTALL=install
FIND=find
PERL=perl
# compiler settings
CFLAGS ?= -O2
PICFLAG = -fPIC
C99FLAG = -std=c99
WCFLAGS = -Wall -pedantic
UCFLAGS = $(CFLAGS) $(PICFLAG) $(C99FLAG) $(WCFLAGS) -DUTF8PROC_EXPORTS $(UTF8PROC_DEFINES)
UCFLAGS = $(CPPFLAGS) $(CFLAGS) $(PICFLAG) $(C99FLAG) $(WCFLAGS) -DUTF8PROC_EXPORTS $(UTF8PROC_DEFINES)
# shared-library version MAJOR.MINOR.PATCH ... this may be *different*
# from the utf8proc version number because it indicates ABI compatibility,
......@@ -36,6 +37,10 @@ endif
prefix=/usr/local
libdir=$(prefix)/lib
includedir=$(prefix)/include
pkgconfigdir=$(libdir)/pkgconfig
pkglibdir=$(libdir:$(prefix)/%=%)
pkgincludedir=$(includedir:$(prefix)/%=%)
# meta targets
......@@ -45,6 +50,7 @@ all: libutf8proc.a libutf8proc.$(SHLIB_EXT)
clean:
rm -f utf8proc.o libutf8proc.a libutf8proc.$(SHLIB_VERS_EXT) libutf8proc.$(SHLIB_EXT)
rm -f libutf8proc.pc
ifneq ($(OS),Darwin)
rm -f libutf8proc.so.$(MAJOR)
endif
......@@ -86,12 +92,22 @@ libutf8proc.$(MAJOR).dylib: utf8proc.o
libutf8proc.dylib: libutf8proc.$(MAJOR).dylib
ln -f -s libutf8proc.$(MAJOR).dylib $@
install: libutf8proc.a libutf8proc.$(SHLIB_EXT) libutf8proc.$(SHLIB_VERS_EXT)
# Generate the pkg-config file from its template by substituting the
# PREFIX, LIBDIR, INCLUDEDIR and VERSION placeholders.  LIBDIR and
# INCLUDEDIR receive $(pkglibdir)/$(pkgincludedir), i.e. the install
# directories expressed relative to $(prefix).
#
# The output is written to a temporary file and renamed into place only
# when sed succeeds, so a failed or interrupted run never leaves behind a
# truncated libutf8proc.pc that make would consider up to date.
libutf8proc.pc: libutf8proc.pc.in
	sed \
		-e 's#PREFIX#$(prefix)#' \
		-e 's#LIBDIR#$(pkglibdir)#' \
		-e 's#INCLUDEDIR#$(pkgincludedir)#' \
		-e 's#VERSION#$(MAJOR).$(MINOR).$(PATCH)#' \
		$< > $@.tmp || { rm -f $@.tmp; exit 1; }
	mv -f $@.tmp $@
install: libutf8proc.a libutf8proc.$(SHLIB_EXT) libutf8proc.$(SHLIB_VERS_EXT) libutf8proc.pc
mkdir -m 755 -p $(DESTDIR)$(includedir)
$(INSTALL) -m 644 utf8proc.h $(DESTDIR)$(includedir)
mkdir -m 755 -p $(DESTDIR)$(libdir)
$(INSTALL) -m 644 libutf8proc.a $(DESTDIR)$(libdir)
$(INSTALL) -m 755 libutf8proc.$(SHLIB_VERS_EXT) $(DESTDIR)$(libdir)
mkdir -m 755 -p $(DESTDIR)$(pkgconfigdir)
$(INSTALL) -m 644 libutf8proc.pc $(DESTDIR)$(pkgconfigdir)/libutf8proc.pc
ln -f -s libutf8proc.$(SHLIB_VERS_EXT) $(DESTDIR)$(libdir)/libutf8proc.$(SHLIB_EXT)
ifneq ($(OS),Darwin)
ln -f -s libutf8proc.$(SHLIB_VERS_EXT) $(DESTDIR)$(libdir)/libutf8proc.so.$(MAJOR)
......@@ -115,31 +131,31 @@ test/tests.o: test/tests.c test/tests.h utf8proc.h
$(CC) $(UCFLAGS) -c -o test/tests.o test/tests.c
test/normtest: test/normtest.c test/tests.o utf8proc.o utf8proc.h test/tests.h
$(CC) $(UCFLAGS) test/normtest.c test/tests.o utf8proc.o -o $@
$(CC) $(UCFLAGS) $(LDFLAGS) test/normtest.c test/tests.o utf8proc.o -o $@
test/graphemetest: test/graphemetest.c test/tests.o utf8proc.o utf8proc.h test/tests.h
$(CC) $(UCFLAGS) test/graphemetest.c test/tests.o utf8proc.o -o $@
$(CC) $(UCFLAGS) $(LDFLAGS) test/graphemetest.c test/tests.o utf8proc.o -o $@
test/printproperty: test/printproperty.c test/tests.o utf8proc.o utf8proc.h test/tests.h
$(CC) $(UCFLAGS) test/printproperty.c test/tests.o utf8proc.o -o $@
$(CC) $(UCFLAGS) $(LDFLAGS) test/printproperty.c test/tests.o utf8proc.o -o $@
test/charwidth: test/charwidth.c test/tests.o utf8proc.o utf8proc.h test/tests.h
$(CC) $(UCFLAGS) test/charwidth.c test/tests.o utf8proc.o -o $@
$(CC) $(UCFLAGS) $(LDFLAGS) test/charwidth.c test/tests.o utf8proc.o -o $@
test/valid: test/valid.c test/tests.o utf8proc.o utf8proc.h test/tests.h
$(CC) $(UCFLAGS) test/valid.c test/tests.o utf8proc.o -o $@
$(CC) $(UCFLAGS) $(LDFLAGS) test/valid.c test/tests.o utf8proc.o -o $@
test/iterate: test/iterate.c test/tests.o utf8proc.o utf8proc.h test/tests.h
$(CC) $(UCFLAGS) test/iterate.c test/tests.o utf8proc.o -o $@
$(CC) $(UCFLAGS) $(LDFLAGS) test/iterate.c test/tests.o utf8proc.o -o $@
test/case: test/case.c test/tests.o utf8proc.o utf8proc.h test/tests.h
$(CC) $(UCFLAGS) test/case.c test/tests.o utf8proc.o -o $@
$(CC) $(UCFLAGS) $(LDFLAGS) test/case.c test/tests.o utf8proc.o -o $@
test/custom: test/custom.c test/tests.o utf8proc.o utf8proc.h test/tests.h
$(CC) $(UCFLAGS) test/custom.c test/tests.o utf8proc.o -o $@
$(CC) $(UCFLAGS) $(LDFLAGS) test/custom.c test/tests.o utf8proc.o -o $@
test/misc: test/misc.c test/tests.o utf8proc.o utf8proc.h test/tests.h
$(CC) $(UCFLAGS) test/misc.c test/tests.o utf8proc.o -o $@
$(CC) $(UCFLAGS) $(LDFLAGS) -DUNICODE_VERSION='"'`$(PERL) -ne "/^UNICODE_VERSION=/ and print $$';" data/Makefile`'"' test/misc.c test/tests.o utf8proc.o -o $@
check: test/normtest data/NormalizationTest.txt test/graphemetest data/GraphemeBreakTest.txt test/printproperty test/case test/custom test/charwidth test/misc test/valid test/iterate bench/bench.c bench/util.c bench/util.h utf8proc.o
$(MAKE) -C bench
......
# utf8proc release history #
## Version 2.3 ##
2019-03-30
- Unicode 12 support ([#148]).
- New function `utf8proc_unicode_version` to return the supported Unicode version ([#151]).
- Simpler character-width computation that no longer uses GNU Unifont metrics: East-Asian wide
characters have width 2, and all other printable characters have width 1 ([#150]).
- Fix `CHARBOUND` option for `utf8proc_map` to preserve U+FFFE and U+FFFF non-characters ([#149]).
- Various build-system improvements ([#141], [#142], [#147]).
## Version 2.2 ##
2018-07-24
......@@ -301,45 +316,52 @@ Release of version 1.0.1
2006-06-02: initial release of version 0.1
[#6]: https://github.com/JuliaLang/utf8proc/issues/6
[#13]: https://github.com/JuliaLang/utf8proc/issues/13
[#17]: https://github.com/JuliaLang/utf8proc/issues/17
[#20]: https://github.com/JuliaLang/utf8proc/issues/20
[#22]: https://github.com/JuliaLang/utf8proc/issues/22
[#24]: https://github.com/JuliaLang/utf8proc/issues/24
[#27]: https://github.com/JuliaLang/utf8proc/issues/27
[#28]: https://github.com/JuliaLang/utf8proc/issues/28
[#29]: https://github.com/JuliaLang/utf8proc/issues/29
[#32]: https://github.com/JuliaLang/utf8proc/issues/32
[#35]: https://github.com/JuliaLang/utf8proc/issues/35
[#40]: https://github.com/JuliaLang/utf8proc/issues/40
[#43]: https://github.com/JuliaLang/utf8proc/issues/43
[#45]: https://github.com/JuliaLang/utf8proc/issues/45
[#47]: https://github.com/JuliaLang/utf8proc/issues/47
[#51]: https://github.com/JuliaLang/utf8proc/issues/51
[#55]: https://github.com/JuliaLang/utf8proc/issues/55
[#58]: https://github.com/JuliaLang/utf8proc/issues/58
[#62]: https://github.com/JuliaLang/utf8proc/issues/62
[#66]: https://github.com/JuliaLang/utf8proc/issues/66
[#68]: https://github.com/JuliaLang/utf8proc/issues/68
[#70]: https://github.com/JuliaLang/utf8proc/issues/70
[#77]: https://github.com/JuliaLang/utf8proc/issues/77
[#78]: https://github.com/JuliaLang/utf8proc/issues/78
[#79]: https://github.com/JuliaLang/utf8proc/issues/79
[#80]: https://github.com/JuliaLang/utf8proc/issues/80
[#84]: https://github.com/JuliaLang/utf8proc/issues/84
[#88]: https://github.com/JuliaLang/utf8proc/issues/88
[#89]: https://github.com/JuliaLang/utf8proc/issues/89
[#90]: https://github.com/JuliaLang/utf8proc/issues/90
[#94]: https://github.com/JuliaLang/utf8proc/issues/94
[#99]: https://github.com/JuliaLang/utf8proc/issues/99
[#113]: https://github.com/JuliaLang/utf8proc/issues/113
[#121]: https://github.com/JuliaLang/utf8proc/issues/121
[#123]: https://github.com/JuliaLang/utf8proc/issues/123
[#125]: https://github.com/JuliaLang/utf8proc/issues/125
[#128]: https://github.com/JuliaLang/utf8proc/issues/128
[#132]: https://github.com/JuliaLang/utf8proc/issues/132
[#133]: https://github.com/JuliaLang/utf8proc/issues/133
[#134]: https://github.com/JuliaLang/utf8proc/issues/134
[#135]: https://github.com/JuliaLang/utf8proc/issues/135
[#140]: https://github.com/JuliaLang/utf8proc/issues/140
[#6]: https://github.com/JuliaStrings/utf8proc/issues/6
[#13]: https://github.com/JuliaStrings/utf8proc/issues/13
[#17]: https://github.com/JuliaStrings/utf8proc/issues/17
[#20]: https://github.com/JuliaStrings/utf8proc/issues/20
[#22]: https://github.com/JuliaStrings/utf8proc/issues/22
[#24]: https://github.com/JuliaStrings/utf8proc/issues/24
[#27]: https://github.com/JuliaStrings/utf8proc/issues/27
[#28]: https://github.com/JuliaStrings/utf8proc/issues/28
[#29]: https://github.com/JuliaStrings/utf8proc/issues/29
[#32]: https://github.com/JuliaStrings/utf8proc/issues/32
[#35]: https://github.com/JuliaStrings/utf8proc/issues/35
[#40]: https://github.com/JuliaStrings/utf8proc/issues/40
[#43]: https://github.com/JuliaStrings/utf8proc/issues/43
[#45]: https://github.com/JuliaStrings/utf8proc/issues/45
[#47]: https://github.com/JuliaStrings/utf8proc/issues/47
[#51]: https://github.com/JuliaStrings/utf8proc/issues/51
[#55]: https://github.com/JuliaStrings/utf8proc/issues/55
[#58]: https://github.com/JuliaStrings/utf8proc/issues/58
[#62]: https://github.com/JuliaStrings/utf8proc/issues/62
[#66]: https://github.com/JuliaStrings/utf8proc/issues/66
[#68]: https://github.com/JuliaStrings/utf8proc/issues/68
[#70]: https://github.com/JuliaStrings/utf8proc/issues/70
[#77]: https://github.com/JuliaStrings/utf8proc/issues/77
[#78]: https://github.com/JuliaStrings/utf8proc/issues/78
[#79]: https://github.com/JuliaStrings/utf8proc/issues/79
[#80]: https://github.com/JuliaStrings/utf8proc/issues/80
[#84]: https://github.com/JuliaStrings/utf8proc/issues/84
[#88]: https://github.com/JuliaStrings/utf8proc/issues/88
[#89]: https://github.com/JuliaStrings/utf8proc/issues/89
[#90]: https://github.com/JuliaStrings/utf8proc/issues/90
[#94]: https://github.com/JuliaStrings/utf8proc/issues/94
[#99]: https://github.com/JuliaStrings/utf8proc/issues/99
[#113]: https://github.com/JuliaStrings/utf8proc/issues/113
[#121]: https://github.com/JuliaStrings/utf8proc/issues/121
[#123]: https://github.com/JuliaStrings/utf8proc/issues/123
[#125]: https://github.com/JuliaStrings/utf8proc/issues/125
[#128]: https://github.com/JuliaStrings/utf8proc/issues/128
[#132]: https://github.com/JuliaStrings/utf8proc/issues/132
[#133]: https://github.com/JuliaStrings/utf8proc/issues/133
[#134]: https://github.com/JuliaStrings/utf8proc/issues/134
[#135]: https://github.com/JuliaStrings/utf8proc/issues/135
[#140]: https://github.com/JuliaStrings/utf8proc/issues/140
[#141]: https://github.com/JuliaStrings/utf8proc/issues/141
[#142]: https://github.com/JuliaStrings/utf8proc/issues/142
[#147]: https://github.com/JuliaStrings/utf8proc/issues/147
[#148]: https://github.com/JuliaStrings/utf8proc/issues/148
[#149]: https://github.com/JuliaStrings/utf8proc/issues/149
[#150]: https://github.com/JuliaStrings/utf8proc/issues/150
[#151]: https://github.com/JuliaStrings/utf8proc/issues/151
......@@ -2,7 +2,7 @@
[![Travis CI Status](https://travis-ci.org/JuliaStrings/utf8proc.png)](https://travis-ci.org/JuliaStrings/utf8proc)
[![AppVeyor status](https://ci.appveyor.com/api/projects/status/ivaa0v6ikxrmm5r6?svg=true)](https://ci.appveyor.com/project/StevenGJohnson/utf8proc)
[utf8proc](http://julialang.org/utf8proc/) is a small, clean C
[utf8proc](http://juliastrings.github.io/utf8proc/) is a small, clean C
library that provides Unicode normalization, case-folding, and other
operations for data in the [UTF-8
encoding](http://en.wikipedia.org/wiki/UTF-8). It was [initially
......@@ -39,7 +39,7 @@ The C library is found in this directory after successful compilation
and is named `libutf8proc.a` (for the static library) and
`libutf8proc.so` (for the dynamic library).
The Unicode version supported is 11.0.0.
The Unicode version supported is 12.0.0.
For Unicode normalizations, the following options are used:
......
CURL=curl
CC = cc
CFLAGS = -O2 -std=c99 -pedantic -Wall
CFLAGS ?= -O2
CFLAGS += -std=c99 -pedantic -Wall
all: bench
......
# Unicode data generation rules. Except for the test data files, most
# users will not use these Makefile rules, which are primarily to re-generate
# unicode_data.c when we get a new Unicode version or charwidth data; they
# require ruby, fontforge, and julia to be installed.
# require ruby and julia to be installed.
# programs
CURL=curl
......@@ -9,7 +9,6 @@ RUBY=ruby
PERL=perl
MAKE=make
JULIA=julia
FONTFORGE=fontforge
CURLFLAGS = --retry 5 --location
.PHONY: clean
......@@ -19,23 +18,11 @@ CURLFLAGS = --retry 5 --location
utf8proc_data.c.new: data_generator.rb UnicodeData.txt GraphemeBreakProperty.txt DerivedCoreProperties.txt CompositionExclusions.txt CaseFolding.txt CharWidths.txt emoji-data.txt
$(RUBY) data_generator.rb < UnicodeData.txt > $@
# GNU Unifont version for font metric calculations:
UNIFONT_VERSION=11.0.01
unifont.ttf:
$(CURL) $(CURLFLAGS) -o $@ $(URLCACHE)https://mirrors.kernel.org/gnu/unifont/unifont-$(UNIFONT_VERSION)/unifont-$(UNIFONT_VERSION).ttf
unifont_upper.ttf:
$(CURL) $(CURLFLAGS) -o $@ $(URLCACHE)https://mirrors.kernel.org/gnu/unifont/unifont-$(UNIFONT_VERSION)/unifont_upper-$(UNIFONT_VERSION).ttf
%.sfd: %.ttf
$(FONTFORGE) -lang=ff -c "Open(\"$<\");Save(\"$@\");Quit(0);"
CharWidths.txt: charwidths.jl unifont.sfd unifont_upper.sfd EastAsianWidth.txt
CharWidths.txt: charwidths.jl EastAsianWidth.txt
$(JULIA) charwidths.jl > $@
# Unicode data version
UNICODE_VERSION=11.0.0
# Unicode data version (must also update utf8proc_unicode_version function)
UNICODE_VERSION=12.0.0
UnicodeData.txt:
$(CURL) $(CURLFLAGS) -o $@ -O http://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/UnicodeData.txt
......@@ -65,5 +52,5 @@ emoji-data.txt:
$(CURL) $(CURLFLAGS) -o $@ -O $(URLCACHE)http://unicode.org/Public/emoji/`echo $(UNICODE_VERSION) | cut -d. -f1-2`/emoji-data.txt
clean:
rm -f UnicodeData.txt EastAsianWidth.txt GraphemeBreakProperty.txt DerivedCoreProperties.txt CompositionExclusions.txt CaseFolding.txt NormalizationTest.txt GraphemeBreakTest.txt CharWidths.txt unifont*.ttf unifont*.sfd emoji-data.txt
rm -f UnicodeData.txt EastAsianWidth.txt GraphemeBreakProperty.txt DerivedCoreProperties.txt CompositionExclusions.txt CaseFolding.txt NormalizationTest.txt GraphemeBreakTest.txt CharWidths.txt emoji-data.txt
rm -f utf8proc_data.c.new
# Following work by @jiahao, we compute character widths using a combination of
# * advance widths from GNU Unifont (advance width 512 = 1 en)
# * character category
# * UAX 11: East Asian Width
# * a few exceptions as needed
# Adapted from http://nbviewer.ipython.org/gist/jiahao/07e8b08bf6d8671e9734
#
# We used to also use data from GNU Unifont, but that has proven unreliable
# and unlikely to match widths assumed by terminals.
#
# Requires Julia (obviously) and FontForge.
#############################################################################
......@@ -14,73 +17,64 @@ CharWidths = Dict{Int,Int}()
# to minimize bootstrapping complexity when a new version of Unicode comes out.
catcode(c) = ccall((:utf8proc_category,"../libutf8proc"), Cint, (Int32,), c)
# use Base.UTF8proc module to get category codes constants, since
# we won't change these in utf8proc.
import Base.UTF8proc
# utf8proc category constants (must match the utf8proc_category_t values in utf8proc.h)
const UTF8PROC_CATEGORY_CN = 0
const UTF8PROC_CATEGORY_LU = 1
const UTF8PROC_CATEGORY_LL = 2
const UTF8PROC_CATEGORY_LT = 3
const UTF8PROC_CATEGORY_LM = 4
const UTF8PROC_CATEGORY_LO = 5
const UTF8PROC_CATEGORY_MN = 6
const UTF8PROC_CATEGORY_MC = 7
const UTF8PROC_CATEGORY_ME = 8
const UTF8PROC_CATEGORY_ND = 9
const UTF8PROC_CATEGORY_NL = 10
const UTF8PROC_CATEGORY_NO = 11
const UTF8PROC_CATEGORY_PC = 12
const UTF8PROC_CATEGORY_PD = 13
const UTF8PROC_CATEGORY_PS = 14
const UTF8PROC_CATEGORY_PE = 15
const UTF8PROC_CATEGORY_PI = 16
const UTF8PROC_CATEGORY_PF = 17
const UTF8PROC_CATEGORY_PO = 18
const UTF8PROC_CATEGORY_SM = 19
const UTF8PROC_CATEGORY_SC = 20
const UTF8PROC_CATEGORY_SK = 21
const UTF8PROC_CATEGORY_SO = 22
const UTF8PROC_CATEGORY_ZS = 23
const UTF8PROC_CATEGORY_ZL = 24
const UTF8PROC_CATEGORY_ZP = 25
const UTF8PROC_CATEGORY_CC = 26
const UTF8PROC_CATEGORY_CF = 27
const UTF8PROC_CATEGORY_CS = 28
const UTF8PROC_CATEGORY_CO = 29
#############################################################################
# Use a default width of 1 for all character categories that are
# letter/symbol/number-like, as well as for unassigned/private-use chars.
# This can be overridden by Unifont or UAX 11
# This can be overridden by UAX 11
# below, but provides a useful nonzero fallback for new codepoints when
# a new Unicode version has been released but Unifont hasn't been updated yet.
zerowidth = Set{Int}() # categories that may contain zero-width chars
push!(zerowidth, UTF8proc.UTF8PROC_CATEGORY_MN)
push!(zerowidth, UTF8proc.UTF8PROC_CATEGORY_MC)
push!(zerowidth, UTF8proc.UTF8PROC_CATEGORY_ME)
push!(zerowidth, UTF8proc.UTF8PROC_CATEGORY_SK)
push!(zerowidth, UTF8proc.UTF8PROC_CATEGORY_ZS)
push!(zerowidth, UTF8proc.UTF8PROC_CATEGORY_ZL)
push!(zerowidth, UTF8proc.UTF8PROC_CATEGORY_ZP)
push!(zerowidth, UTF8proc.UTF8PROC_CATEGORY_CC)
push!(zerowidth, UTF8proc.UTF8PROC_CATEGORY_CF)
push!(zerowidth, UTF8proc.UTF8PROC_CATEGORY_CS)
push!(zerowidth, UTF8PROC_CATEGORY_MN)
push!(zerowidth, UTF8PROC_CATEGORY_MC)
push!(zerowidth, UTF8PROC_CATEGORY_ME)
push!(zerowidth, UTF8PROC_CATEGORY_SK)
push!(zerowidth, UTF8PROC_CATEGORY_ZL)
push!(zerowidth, UTF8PROC_CATEGORY_ZP)
push!(zerowidth, UTF8PROC_CATEGORY_CC)
push!(zerowidth, UTF8PROC_CATEGORY_CF)
push!(zerowidth, UTF8PROC_CATEGORY_CS)
# Assign the default width of 1 to every codepoint whose category is NOT in
# the `zerowidth` set; codepoints left out of `CharWidths` fall back to
# width 0 when the table is written out.
# The membership test is spelled `!(x in set)` rather than with the `∉`
# operator so the condition survives tools that drop non-ASCII characters.
for c in 0x0000:0x110000
    if !(catcode(c) in zerowidth)
        CharWidths[c] = 1
    end
end
#############################################################################
# Widths from GNU Unifont
#Read sfdfile for character widths
# Read per-glyph advance widths from a FontForge .sfd file.
#
# Arguments:
#   filename   - path of the .sfd file to scan
#   CharWidths - table (codepoint => width) to update; defaults to a new Dict
#
# Returns `CharWidths`, with an entry added or overwritten for every glyph
# that has a non-negative codepoint and a width of at least 512 font units.
#
# Each glyph record starts at a "StartChar: " line and supplies, e.g.,
#   Encoding: 65538 -1 2
#   Width: 1024
# where the third whitespace-separated token of the Encoding line is taken
# as the codepoint and the second token of the Width line as the width.
function parsesfd(filename::AbstractString, CharWidths::Dict{Int,Int}=Dict{Int,Int}())
    codepoint = width = nothing
    inrecord = false
    # eachline(filename) closes the file once iteration is finished
    for line in eachline(filename)
        if occursin("StartChar: ", line)
            # A new record always discards leftovers from the previous one, so
            # a glyph that follows an unencoded glyph (codepoint -1) can never
            # be paired with that earlier glyph's width.
            codepoint = width = nothing
            inrecord = true
        elseif inrecord
            occursin("Encoding:", line) && (codepoint = parse(Int, split(line)[3]))
            occursin("Width:", line) && (width = parse(Int, split(line)[2]))
            if codepoint !== nothing && width !== nothing
                if codepoint >= 0
                    w = div(width, 512) # 512 units to the en
                    if w > 0
                        # only add nonzero widths, since (1) the default is zero
                        # and (2) this circumvents some apparent bugs in Unifont
                        # (https://savannah.gnu.org/bugs/index.php?45395)
                        CharWidths[codepoint] = w
                    end
                end
                # record complete: skip everything up to the next StartChar
                inrecord = false
            end
        end
    end
    CharWidths
end
CharWidths=parsesfd("unifont.sfd", CharWidths)
CharWidths=parsesfd("unifont_upper.sfd", CharWidths)
#############################################################################
# Widths from UAX #11: East Asian Width
# .. these take precedence over the Unifont width for all codepoints
# .. these take precedence for all codepoints
# listed explicitly as wide/full/narrow/half-width
for line in readlines(open("EastAsianWidth.txt"))
......@@ -118,14 +112,14 @@ for c in keys(CharWidths)
# (some of these, like U+0601, can have a width in some cases
# but normally act like prepended combining marks. U+fff9 etc
# are also odd, but have zero width in typical terminal contexts)
if cat==UTF8proc.UTF8PROC_CATEGORY_CF
if cat==UTF8PROC_CATEGORY_CF
CharWidths[c]=0
end
# Unifont has nonzero width for a number of non-spacing combining
# characters, e.g. (in 7.0.06): f84,17b4,17b5,180b,180d,2d7f, and
# the variation selectors
if cat==UTF8proc.UTF8PROC_CATEGORY_MN
if cat==UTF8PROC_CATEGORY_MN
CharWidths[c]=0
end
......@@ -133,12 +127,12 @@ for c in keys(CharWidths)
# codepoints (Unifont includes ConScript Unicode Registry PUA fonts,
# but since these are nonstandard it seems questionable to use Unifont metrics;
# if they are printed as the replacement character U+FFFD they will have width 1).
if cat==UTF8proc.UTF8PROC_CATEGORY_CO || cat==UTF8proc.UTF8PROC_CATEGORY_CN
if cat==UTF8PROC_CATEGORY_CO || cat==UTF8PROC_CATEGORY_CN
CharWidths[c]=1
end
# for some reason, Unifont has width-2 glyphs for ASCII control chars
if cat==UTF8proc.UTF8PROC_CATEGORY_CC
if cat==UTF8PROC_CATEGORY_CC
CharWidths[c]=0
end
end
......@@ -152,24 +146,15 @@ CharWidths[0x00ad]=1
CharWidths[0x2028]=0
CharWidths[0x2029]=0
#By definition, should be narrow = width of 1 en space
#0x00202f ' ' category: Zs name: NARROW NO-BREAK SPACE/
CharWidths[0x202f]=1
#By definition, should be wide = width of 1 em space
#0x002001 ' ' category: Zs name: EM QUAD/
#0x002003 ' ' category: Zs name: EM SPACE/
CharWidths[0x2001]=2
CharWidths[0x2003]=2
#############################################################################
# Output (to a file or pipe) for processing by data_generator.rb,
# encoded as a sequence of intervals.
firstc = 0x000000
lastv = 0
uhex(c) = uppercase(hex(c,4))
uhex(c) = uppercase(string(c,base=16,pad=4))
for c in 0x0000:0x110000
global firstc, lastv
v = get(CharWidths, c, 0)
if v != lastv || c == 0x110000
v < 4 || error("invalid charwidth $v for $c")
......
prefix=PREFIX
exec_prefix=${prefix}
libdir=${prefix}/LIBDIR
includedir=${prefix}/INCLUDEDIR
Name: libutf8proc
Description: UTF8 processing
Version: VERSION
Libs: -L${libdir} -lutf8proc
Cflags: -I${includedir} -DUTF8PROC_EXPORTS
......@@ -7,17 +7,17 @@ int main(int argc, char **argv)
FILE *f = argc > 1 ? fopen(argv[1], "r") : NULL;
utf8proc_uint8_t src[1024];
int len;
check(f != NULL, "error opening GraphemeBreakTest.txt");
while (getline(&buf, &bufsize, f) > 0) {
size_t bi = 0, si = 0;
lineno += 1;
if (lineno % 100 == 0)
printf("checking line %zd...\n", lineno);
if (buf[0] == '#') continue;
while (buf[bi]) {
bi = skipspaces(buf, bi);
if (buf[bi] == '/') { /* grapheme break */
......@@ -39,7 +39,7 @@ int main(int argc, char **argv)
if (si && src[si-1] == '/')
--si; /* no break after final grapheme */
src[si] = 0; /* NUL-terminate */
if (si) {
utf8proc_uint8_t utf8[1024]; /* copy src without 0xff grapheme separators */
size_t i = 0, j = 0;
......@@ -70,5 +70,17 @@ int main(int argc, char **argv)
}
fclose(f);
printf("Passed tests after %zd lines!\n", lineno);
/* issue 144: with UTF8PROC_CHARBOUND, the non-characters U+FFFF and U+FFFE
   must be re-encoded as ordinary 3-byte sequences, each preceded by a
   single 0xff grapheme-boundary marker byte */
{
    utf8proc_uint8_t input[] = {0xef,0xbf,0xbf,0xef,0xbf,0xbe,0x00}; /* "\uffff\ufffe" */
    utf8proc_uint8_t output[] = {0xff,0xef,0xbf,0xbf,0xff,0xef,0xbf,0xbe,0x00}; /* with 0xff grapheme markers */
    utf8proc_ssize_t glen;
    utf8proc_uint8_t *g;
    glen = utf8proc_map(input, 6, &g, UTF8PROC_CHARBOUND);
    /* utf8proc_map reports failure through a negative return value; verify
       success before g is read (this also keeps glen from being a
       set-but-unused variable when building with -Wall -Werror) */
    check(glen >= 0, "utf8proc_map failed on u+ffff and u+fffe input");
    check(!strcmp((char*)g, (char*)output), "mishandled u+ffff and u+fffe grapheme breaks");
    free(g);
}
return 0;
}
......@@ -41,6 +41,10 @@ int main(void)
{
issue128();
issue102();
#ifdef UNICODE_VERSION
printf("Unicode version: Makefile has %s, has API %s\n", UNICODE_VERSION, utf8proc_unicode_version());
check(!strcmp(UNICODE_VERSION, utf8proc_unicode_version()), "utf8proc_unicode_version mismatch");
#endif
printf("Misc tests SUCCEEDED.\n");
return 0;
}
......@@ -100,6 +100,10 @@ UTF8PROC_DLLEXPORT const char *utf8proc_version(void) {
return STRINGIZE(UTF8PROC_VERSION_MAJOR) "." STRINGIZE(UTF8PROC_VERSION_MINOR) "." STRINGIZE(UTF8PROC_VERSION_PATCH) "";
}
/* Report the Unicode version this library's data corresponds to, as a
   constant "MAJOR.MINOR.PATCH" string.  The misc test compares this value
   against the UNICODE_VERSION passed in from the build, so the two must be
   updated together. */
UTF8PROC_DLLEXPORT const char *utf8proc_unicode_version(void) {
  static const char unicode_version[] = "12.0.0";
  return unicode_version;
}
UTF8PROC_DLLEXPORT const char *utf8proc_errmsg(utf8proc_ssize_t errcode) {
switch (errcode) {
case UTF8PROC_ERROR_NOMEM:
......@@ -196,9 +200,13 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_encode_char(utf8proc_int32_t uc, ut
} else return 0;
}
/* internal "unsafe" version that does not check whether uc is in range */
static utf8proc_ssize_t unsafe_encode_char(utf8proc_int32_t uc, utf8proc_uint8_t *dst) {
/* internal version used for inserting 0xff bytes between graphemes */
static utf8proc_ssize_t charbound_encode_char(utf8proc_int32_t uc, utf8proc_uint8_t *dst) {
if (uc < 0x00) {
if (uc == -1) { /* internal value used for grapheme breaks */
dst[0] = (utf8proc_uint8_t)0xFF;
return 1;
}
return 0;
} else if (uc < 0x80) {
dst[0] = (utf8proc_uint8_t)uc;
......@@ -207,12 +215,6 @@ static utf8proc_ssize_t unsafe_encode_char(utf8proc_int32_t uc, utf8proc_uint8_t
dst[0] = (utf8proc_uint8_t)(0xC0 + (uc >> 6));
dst[1] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F));
return 2;
} else if (uc == 0xFFFF) {
dst[0] = (utf8proc_uint8_t)0xFF;
return 1;
} else if (uc == 0xFFFE) {
dst[0] = (utf8proc_uint8_t)0xFE;
return 1;
} else if (uc < 0x10000) {
dst[0] = (utf8proc_uint8_t)(0xE0 + (uc >> 12));
dst[1] = (utf8proc_uint8_t)(0x80 + ((uc >> 6) & 0x3F));
......@@ -480,7 +482,7 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_char(utf8proc_int32_t uc,
int tbc = property->boundclass;
boundary = grapheme_break_extended(*last_boundclass, tbc, last_boundclass);
if (boundary) {
if (bufsize >= 1) dst[0] = 0xFFFF;
if (bufsize >= 1) dst[0] = -1; /* sentinel value for grapheme break */
if (bufsize >= 2) dst[1] = uc;
return 2;
}
......@@ -686,7 +688,7 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(utf8proc_int32_t *buffer,
if (options & UTF8PROC_CHARBOUND) {
for (rpos = 0; rpos < length; rpos++) {
uc = buffer[rpos];
wpos += unsafe_encode_char(uc, ((utf8proc_uint8_t *)buffer) + wpos);
wpos += charbound_encode_char(uc, ((utf8proc_uint8_t *)buffer) + wpos);
}
} else {
for (rpos = 0; rpos < length; rpos++) {
......
......@@ -408,6 +408,11 @@ UTF8PROC_DLLEXPORT extern const utf8proc_int8_t utf8proc_utf8class[256];
*/
UTF8PROC_DLLEXPORT const char *utf8proc_version(void);
/**
* Returns the utf8proc supported Unicode version as a string MAJOR.MINOR.PATCH.
*/
UTF8PROC_DLLEXPORT const char *utf8proc_unicode_version(void);
/**
* Returns an informative error string for the given utf8proc error code
* (e.g. the error codes returned by @ref utf8proc_map).
......@@ -595,7 +600,8 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(utf8proc_int32_t *buffer,
* matching the rules in Unicode 8.0.0.
*
* @warning If the state parameter is used, `utf8proc_grapheme_break_stateful` must
* be called IN ORDER on ALL potential breaks in a string.
* be called IN ORDER on ALL potential breaks in a string. However, it
* is safe to reset the state to zero after a grapheme break.
*/
UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break_stateful(
utf8proc_int32_t codepoint1, utf8proc_int32_t codepoint2, utf8proc_int32_t *state);
......
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment