Commit 50e9ab12 authored by Peter Stephenson

22556: Multibyte separators and delimiters

parent 6ca7b6ab
2006-07-24 Peter Stephenson <p.w.stephenson@ntlworld.com>
* 22556: Doc/Zsh/builtins.yo, Functions/Zle/insert-composed-char,
Src/builtin.c, Src/pattern.c, Src/subst.c, Src/utils.c, Src/zsh.h,
Src/ztype.h, Src/Zle/zle.h, Src/Zle/zle_main.c,
Test/D04parameter.ztst, Test/D07multibyte.ztst: Multibyte
separators and delimiters.
2006-07-18 Clint Adams <clint@zsh.org>
* 22554: Jesse Weinstein: Completion/Unix/Command/_vorbiscomment:
......
......@@ -1003,6 +1003,10 @@ Read only one (or var(num)) characters. All are assigned to the first
var(name), without word splitting. This flag is ignored when tt(-q) is
present. Input is read from the terminal unless one of tt(-u) or tt(-p)
is present. This option may also be used within zle widgets.
Note that despite the mnemonic `key' this option does read full
characters, which may consist of multiple bytes if the option
tt(MULTIBYTE) is set.
)
item(tt(-z))(
Read one entry from the editor buffer stack and assign it to the first
......
......@@ -128,7 +128,7 @@
# 'm Macron
# '' Acute
emulate -LR zsh
emulate -L zsh
setopt cbases extendedglob printeightbit
local accent basechar ochar error
......@@ -165,7 +165,8 @@ else
fi
local -A charmap
charmap=(${=zsh_accented_chars[$accent]})
# just in case someone is monkeying with IFS...
charmap=(${(s. .)zsh_accented_chars[$accent]})
if [[ ${#charmap} -eq 0 || -z $charmap[$basechar] ]]; then
$error "Combination ${basechar}${accent} is not available."
......
......@@ -62,11 +62,11 @@ typedef wint_t ZLE_INT_T;
#define ZC_iblank wcsiblank
#define ZC_icntrl iswcntrl
#define ZC_idigit iswdigit
#define ZC_iident wcsiident
#define ZC_iident(x) wcsitype((x), IIDENT)
#define ZC_ilower iswlower
#define ZC_inblank iswspace
#define ZC_iupper iswupper
#define ZC_iword wcsiword
#define ZC_iword(x) wcsitype((x), IWORD)
#define ZC_tolower towlower
#define ZC_toupper towupper
......
......@@ -1290,32 +1290,40 @@ bin_vared(char *name, char **args, Options ops, UNUSED(int func))
char **arr = getarrvalue(v), **aptr, **tmparr, **tptr;
tptr = tmparr = (char **)zhalloc(sizeof(char *)*(arrlen(arr)+1));
for (aptr = arr; *aptr; aptr++) {
int sepcount = 0;
int sepcount = 0, clen;
convchar_t c;
/*
* See if this word contains a separator character
* or backslash
*/
for (t = *aptr; *t; t++) {
if (*t == Meta) {
if (isep(t[1] ^ 32))
sepcount++;
MB_METACHARINIT();
for (t = *aptr; *t; ) {
if (*t == '\\') {
t++;
} else if (isep(*t) || *t == '\\')
sepcount++;
} else {
t += MB_METACHARLENCONV(t, &c);
if (MB_ZISTYPE(c, ISEP))
sepcount++;
}
}
if (sepcount) {
/* Yes, so allocate enough space to quote it. */
char *newstr, *nptr;
newstr = zhalloc(strlen(*aptr)+sepcount+1);
/* Go through string quoting separators */
MB_METACHARINIT();
for (t = *aptr, nptr = newstr; *t; ) {
if (*t == Meta) {
if (isep(t[1] ^ 32))
*nptr++ = '\\';
*nptr++ = *t++;
} else if (isep(*t) || *t == '\\')
if (*t == '\\') {
*nptr++ = '\\';
*nptr++ = *t++;
*nptr++ = *t++;
} else {
clen = MB_METACHARLENCONV(t, &c);
if (MB_ZISTYPE(c, ISEP))
*nptr++ = '\\';
while (clen--)
*nptr++ = *t++;
}
}
*nptr = '\0';
/* Stick this into the array of words to join up */
......
This diff is collapsed.
......@@ -318,7 +318,7 @@ metacharinc(char **x)
inchar = *inptr++;
}
*x = inptr;
return (wchar_t)inchar;
return (wchar_t)STOUC(inchar);
}
while (*inptr) {
......@@ -352,12 +352,14 @@ typedef int patint_t;
#define PEOF EOF
#define METACHARINC(x) ((void)((x) += (*(x) == Meta) ? 2 : 1))
#endif
/*
* Return unmetafied char from string (x is any char *)
* Return unmetafied char from string (x is any char *).
* Used with MULTIBYTE_SUPPORT if the GF_MULTIBYTE is not
* in effect.
*/
#define UNMETA(x) (*(x) == Meta ? (x)[1] ^ 32 : *(x))
#endif
/* Add n more characters, ensuring there is enough space. */
......@@ -1575,7 +1577,7 @@ charref(char *x, char *y)
size_t ret;
if (!(patglobflags & GF_MULTIBYTE) || !(STOUC(*x) & 0x80))
return (wchar_t) *x;
return (wchar_t) STOUC(*x);
ret = mbrtowc(&wc, x, y-x, &shiftstate);
......@@ -1583,7 +1585,7 @@ charref(char *x, char *y)
/* Error. Treat as single byte. */
/* Reset the shift state for next time. */
memset(&shiftstate, 0, sizeof(shiftstate));
return (wchar_t) *x;
return (wchar_t) STOUC(*x);
}
return wc;
......@@ -1626,7 +1628,7 @@ charrefinc(char **x, char *y)
size_t ret;
if (!(patglobflags & GF_MULTIBYTE) || !(STOUC(**x) & 0x80))
return (wchar_t) *(*x)++;
return (wchar_t) STOUC(*(*x)++);
ret = mbrtowc(&wc, *x, y-*x, &shiftstate);
......@@ -1634,7 +1636,7 @@ charrefinc(char **x, char *y)
/* Error. Treat as single byte. */
/* Reset the shift state for next time. */
memset(&shiftstate, 0, sizeof(shiftstate));
return (wchar_t) *(*x)++;
return (wchar_t) STOUC(*(*x)++);
}
/* Nulls here are normal characters */
......@@ -2222,20 +2224,33 @@ patmatch(Upat prog)
}
break;
case P_ANYOF:
if (patinput == patinend ||
!patmatchrange((char *)P_OPERAND(scan),
CHARREF(patinput, patinend)))
fail = 1;
else
CHARINC(patinput, patinend);
break;
case P_ANYBUT:
if (patinput == patinend ||
patmatchrange((char *)P_OPERAND(scan),
CHARREF(patinput, patinend)))
if (patinput == patinend)
fail = 1;
else
CHARINC(patinput, patinend);
else {
#ifdef MULTIBYTE_SUPPORT
wchar_t cr = CHARREF(patinput, patinend);
char *scanop = (char *)P_OPERAND(scan);
if (patglobflags & GF_MULTIBYTE) {
if (mb_patmatchrange(scanop, cr) ^
(P_OP(scan) == P_ANYOF))
fail = 1;
else
CHARINC(patinput, patinend);
} else if (patmatchrange(scanop, (int)cr) ^
(P_OP(scan) == P_ANYOF))
fail = 1;
else
CHARINC(patinput, patinend);
#else
if (patmatchrange((char *)P_OPERAND(scan),
CHARREF(patinput, patinend)) ^
(P_OP(scan) == P_ANYOF))
fail = 1;
else
CHARINC(patinput, patinend);
#endif
}
break;
case P_NUMRNG:
case P_NUMFROM:
......@@ -2923,7 +2938,7 @@ patmatch(Upat prog)
/**/
static int
patmatchrange(char *range, wchar_t ch)
mb_patmatchrange(char *range, wchar_t ch)
{
wchar_t r1, r2;
......@@ -2994,21 +3009,20 @@ patmatchrange(char *range, wchar_t ch)
return 1;
break;
case PP_IDENT:
if (wcsiident(ch))
if (wcsitype(ch, IIDENT))
return 1;
break;
case PP_IFS:
/* TODO */
if (isep(ch))
if (wcsitype(ch, ISEP))
return 1;
break;
case PP_IFSSPACE:
/* TODO */
if (iwsep(ch))
/* must be ASCII space character */
if (ch < 128 && iwsep((int)ch))
return 1;
break;
case PP_WORD:
if (wcsiword(ch))
if (wcsitype(ch, IWORD))
return 1;
break;
case PP_RANGE:
......@@ -3031,7 +3045,7 @@ patmatchrange(char *range, wchar_t ch)
}
/**/
#else
#endif
/**/
static int
......@@ -3142,9 +3156,6 @@ patmatchrange(char *range, int ch)
return 0;
}
/**/
#endif
/*
* Repeatedly match something simple and say how many times.
* charstart is an array parallel to that starting at patinput
......@@ -3180,20 +3191,26 @@ static int patrepeat(Upat p, char *charstart)
}
break;
case P_ANYOF:
while (scan < patinend &&
patmatchrange(opnd, CHARREF(scan, patinend))) {
charstart[scan-patinput] = 1;
count++;
CHARINC(scan, patinend);
}
break;
case P_ANYBUT:
while (scan < patinend &&
!patmatchrange(opnd, CHARREF(scan, patinend))) {
while (scan < patinend) {
#ifdef MULTIBYTE_SUPPORT
wchar_t cr = CHARREF(scan, patinend);
if (patglobflags & GF_MULTIBYTE) {
if (mb_patmatchrange(opnd, cr) ^
(P_OP(p) == P_ANYOF))
break;
} else if (patmatchrange(opnd, (int)cr) ^
(P_OP(p) == P_ANYOF))
break;
#else
if (patmatchrange(opnd, CHARREF(scan, patinend)) ^
P_OP(p) == P_ANYOF)
break;
#endif
charstart[scan-patinput] = 1;
count++;
CHARINC(scan, patinend);
}
}
break;
#ifdef DEBUG
default:
......
......@@ -316,9 +316,14 @@ multsub(char **s, int split, char ***a, int *isarr, char *sep)
local_list1(foo);
if (split) {
for ( ; *x; x += l+1) {
/*
* This doesn't handle multibyte characters, but we're
* looking for whitespace separators which must be ASCII.
*/
for ( ; *x; x += l) {
char c = (l = *x == Meta) ? x[1] ^ 32 : *x;
if (!iwsep(c))
l++;
if (!iwsep(STOUC(c)))
break;
}
}
......@@ -328,20 +333,35 @@ multsub(char **s, int split, char ***a, int *isarr, char *sep)
if (split) {
LinkNode n = firstnode(&foo);
int inq = 0, inp = 0;
for ( ; *x; x += l+1) {
char c = (l = *x == Meta) ? x[1] ^ 32 : *x;
if (!inq && !inp && isep(c)) {
*x = '\0';
for (x += l+1; *x; x += l+1) {
c = (l = *x == Meta) ? x[1] ^ 32 : *x;
if (!isep(c))
MB_METACHARINIT();
for ( ; *x; x += l) {
int rawc = -1;
convchar_t c;
if (itok(STOUC(*x))) {
/* token, can't be separator, must be single byte */
rawc = *x;
l = 1;
} else {
l = MB_METACHARLENCONV(x, &c);
if (!inq && !inp && MB_ZISTYPE(c, ISEP)) {
*x = '\0';
for (x += l; *x; x += l) {
if (itok(STOUC(*x))) {
/* as above */
rawc = *x;
l = 1;
break;
}
l = MB_METACHARLENCONV(x, &c);
if (!MB_ZISTYPE(c, ISEP))
break;
}
if (!*x)
break;
insertlinknode(&foo, n, (void *)x), incnode(n);
}
if (!*x)
break;
insertlinknode(&foo, n, (void *)x), incnode(n);
}
switch (c) {
switch (rawc) {
case Dnull: /* " */
case Snull: /* ' */
case Tick: /* ` (note: no Qtick!) */
......@@ -357,8 +377,8 @@ multsub(char **s, int split, char ***a, int *isarr, char *sep)
case Bnull: /* \ */
case Bnullkeep:
/* The parser verified the following char's existence. */
x += l+1;
l = *x == Meta;
x += l;
l = MB_METACHARLEN(x);
break;
}
}
......@@ -685,12 +705,14 @@ invinstrpcmp(const void *a, const void *b)
static char *
dopadding(char *str, int prenum, int postnum, char *preone, char *postone, char *premul, char *postmul)
{
char def[3], *ret, *t, *r;
char *def, *ret, *t, *r;
int ls, ls2, lpreone, lpostone, lpremul, lpostmul, lr, f, m, c, cc;
def[0] = *ifs ? *ifs : ' ';
def[1] = *ifs == Meta ? ifs[1] ^ 32 : '\0';
def[2] = '\0';
MB_METACHARINIT();
if (*ifs)
def = dupstrpfx(ifs, MB_METACHARLEN(ifs));
else
def = "";
if (preone && !*preone)
preone = def;
if (postone && !*postone)
......
This diff is collapsed.
......@@ -1925,6 +1925,8 @@ typedef char *(*ZleGetLineFn) _((int *, int *));
#ifdef MULTIBYTE_SUPPORT
#define nicezputs(str, outs) (void)mb_niceformat((str), (outs), NULL, 0)
#define MB_METACHARINIT() mb_metacharinit()
typedef wint_t convchar_t;
#define MB_METACHARLENCONV(str, cp) mb_metacharlenconv((str), (cp))
#define MB_METACHARLEN(str) mb_metacharlenconv(str, NULL)
#define MB_METASTRLEN(str) mb_metastrlen(str)
......@@ -1948,6 +1950,8 @@ typedef char *(*ZleGetLineFn) _((int *, int *));
#else
#define MB_METACHARINIT()
typedef int convchar_t;
#define MB_METACHARLENCONV(str, cp) metacharlenconv((str), (cp))
#define MB_METACHARLEN(str) (*(str) == Meta ? 2 : 1)
#define MB_METASTRLEN(str) ztrlen(str)
......
......@@ -59,6 +59,12 @@
#define iwsep(X) zistype(X,IWSEP)
#define inull(X) zistype(X,INULL)
/*
 * MB_ZISTYPE(X,Y): character-type test usable in both build variants.
 * With MULTIBYTE_SUPPORT it expands to wcsitype(), otherwise to
 * zistype(), so callers can write one spelling regardless of whether
 * wide-character support is compiled in.  Y is a type flag such as
 * ISEP; X is the character to classify.
 */
#ifdef MULTIBYTE_SUPPORT
#define MB_ZISTYPE(X,Y) wcsitype((X),(Y))
#else
#define MB_ZISTYPE(X,Y) zistype((X),(Y))
#endif
#define iascii(X) isascii(STOUC(X))
#define ilower(X) islower(STOUC(X))
#define iprint(X) isprint(STOUC(X))
......
......@@ -725,6 +725,29 @@
>7
>8
# Tests a long-standing bug with joining on metafied characters in IFS
(array=(one two three)
IFS=$'\0'
foo="$array"
for (( i = 1; i <= ${#foo}; i++ )); do
char=${foo[i]}
print $(( #char ))
done)
0:Joining with NULL character from IFS
>111
>110
>101
>0
>116
>119
>111
>0
>116
>104
>114
>101
>101
unset SHLVL
(( SHLVL++ ))
print $SHLVL
......
......@@ -174,3 +174,57 @@
1:POSIX_IDENTIFIERS option
>3
?(eval):1: command not found: hähä=3
foo="Ølaf«Ødd«øpénëd«ån«àpple"
print -l ${(s.«.)foo}
ioh="Ἐν ἀρχῇ ἦν ὁ λόγος, καὶ ὁ λόγος ἦν πρὸς τὸν θεόν, καὶ θεὸς ἦν ὁ λόγος."
print -l ${=ioh}
print ${(w)#ioh}
0:Splitting with multibyte characters
>Ølaf
>Ødd
>øpénëd
>ån
>àpple
>Ἐν
>ἀρχῇ
>ἦν
>ὁ
>λόγος,
>καὶ
>ὁ
>λόγος
>ἦν
>πρὸς
>τὸν
>θεόν,
>καὶ
>θεὸς
>ἦν
>ὁ
>λόγος.
>17
read -d £ one
read -d £ two
print $one
print $two
0:read with multibyte delimiter
<first£second£
>first
>second
(IFS=«
read -d » -A array
print -l $array)
0:read -A with multibyte IFS
<dominus«illuminatio«mea»ignored
>dominus
>illuminatio
>mea
read -k2 -u0 twochars
print $twochars
0:read multibyte characters
<«»ignored
>«»
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment