From d5c106a95c49508f5e214f2fa174968eee2352fc Mon Sep 17 00:00:00 2001 From: christos Date: Sat, 6 Jun 2015 21:19:07 +0000 Subject: [PATCH] PR/437: Fix handling of invalid unicode characters. tcsh uses the high order bits to encode attributes in the prompt and the high bit in regular characters. Make the drawing routines take an argument indicating if we are drawing the prompt or not, so that we can decide how to deal with the high bits. This solution is the minimum diff and does not allow "large valued" unicode characters to be in the prompt (because they would conflict with the attribute bits). A better solution would be to have a struct for each character so we could encode extra attributes. --- Fixes | 1 + ed.chared.c | 2 +- ed.refresh.c | 54 ++++++++++++++++++++++++++++++++++++++++++------------ ed.xmap.c | 2 +- sh.file.c | 2 +- sh.glob.c | 9 +++++++-- sh.h | 15 ++++++++++++--- sh.hist.c | 2 +- sh.misc.c | 22 ++++++++++++++++++---- tc.func.c | 7 +++++++ tc.nls.c | 40 ++++++++++++++++++++++++++++++---------- tc.nls.h | 3 ++- tc.printf.c | 2 +- tc.str.c | 22 ++++++++++++++++++---- tw.parse.c | 7 ++++++- 15 files changed, 148 insertions(+), 42 deletions(-) diff --git a/Fixes b/Fixes index 7d0ceac..aa779b1 100644 --- a/Fixes +++ b/Fixes @@ -1,3 +1,4 @@ + 2. PR/437: Fix handling of invalid unicode characters. 1. PR/451: Fix error messages containing %c to be always '%c' 41. 
V6.19.00 - 20150521 diff --git a/ed.chared.c b/ed.chared.c index bade211..c0bd41b 100644 --- a/ed.chared.c +++ b/ed.chared.c @@ -3387,7 +3387,7 @@ e_stuff_char(Char c) (void) Cookedmode(); (void) xwrite(SHIN, "\n", 1); - len = one_wctomb(buf, c & CHAR); + len = one_wctomb(buf, c); for (i = 0; i < len; i++) (void) ioctl(SHIN, TIOCSTI, (ioctl_t) &buf[i]); diff --git a/ed.refresh.c b/ed.refresh.c index 9e6da00..a88c5e5 100644 --- a/ed.refresh.c +++ b/ed.refresh.c @@ -46,7 +46,7 @@ static int vcursor_h, vcursor_v; static int rprompt_h, rprompt_v; static int MakeLiteral (Char *, int, Char); -static int Draw (Char *, int); +static int Draw (Char *, int, int); static void Vdraw (Char, int); static void RefreshPromptpart (Char *); static void update_line (Char *, Char *, int); @@ -159,15 +159,44 @@ static int MakeLiteral(Char *str, int len, Char addlit) return i | LITERAL; } +/* draw char at cp, expand tabs, ctl chars */ static int -Draw(Char *cp, int nocomb) /* draw char at cp, expand tabs, ctl chars */ +Draw(Char *cp, int nocomb, int drawPrompt) { int w, i, lv, lh; Char c, attr; +#ifdef WIDE_STRINGS + if (!drawPrompt) { /* draw command-line */ + attr = 0; + c = *cp; + } else { /* draw prompt */ + /* prompt with attributes(UNDER,BOLD,STANDOUT) */ + if (*cp & (UNDER | BOLD | STANDOUT)) { /* *cp >= STANDOUT */ + + /* example) + * We can't distinguish whether (*cp=)0x02ffffff is + * U+02FFFFFF or U+00FFFFFF|STANDOUT. + * We handle as U+00FFFFFF|STANDOUT, only when drawing prompt. 
*/ + attr = (*cp & ATTRIBUTES); + /* ~(UNDER | BOLD | STANDOUT) = 0xf1ffffff */ + c = *cp & ~(UNDER | BOLD | STANDOUT); + + /* if c is ctrl code, we handle *cp as having no attributes */ + if ((c < 0x20 && c >= 0) || c == 0x7f) { + attr = 0; + c = *cp; + } + } else { /* prompt without attributes */ + attr = 0; + c = *cp; + } + } +#else attr = *cp & ~CHAR; c = *cp & CHAR; - w = NLSClassify(c, nocomb); +#endif + w = NLSClassify(c, nocomb, drawPrompt); switch (w) { case NLSCLASS_NL: Vdraw('\0', 0); /* assure end of line */ @@ -201,10 +230,11 @@ Draw(Char *cp, int nocomb) /* draw char at cp, expand tabs, ctl chars */ case NLSCLASS_ILLEGAL2: case NLSCLASS_ILLEGAL3: case NLSCLASS_ILLEGAL4: - Vdraw('\\' | attr, 1); - Vdraw('U' | attr, 1); - Vdraw('+' | attr, 1); - for (i = 8 * NLSCLASS_ILLEGAL_SIZE(w) - 4; i >= 0; i -= 4) + case NLSCLASS_ILLEGAL5: + Vdraw('\\', 1); + Vdraw('U', 1); + Vdraw('+', 1); + for (i = 16 + 4 * (-w-5); i >= 0; i -= 4) Vdraw("0123456789ABCDEF"[(c >> i) & 15] | attr, 1); break; case 0: @@ -302,7 +332,7 @@ RefreshPromptpart(Char *buf) } } else - cp += Draw(cp, cp == buf); + cp += Draw(cp, cp == buf, 1); } } @@ -354,7 +384,7 @@ Refresh(void) cur_v = vcursor_v; Cursor = cp; } - cp += Draw(cp, cp == InputBuf); + cp += Draw(cp, cp == InputBuf, 0); } if (cur_h == -1) { /* if I haven't been set yet, I'm at the end */ @@ -1126,7 +1156,7 @@ RefCursor(void) cp++; continue; } - w = NLSClassify(*cp & CHAR, cp == Prompt); + w = NLSClassify(*cp & CHAR, cp == Prompt, 0); cp++; switch(w) { case NLSCLASS_NL: @@ -1158,7 +1188,7 @@ RefCursor(void) } for (cp = InputBuf; cp < Cursor;) { /* do input buffer to Cursor */ - w = NLSClassify(*cp & CHAR, cp == InputBuf); + w = NLSClassify(*cp & CHAR, cp == InputBuf, 0); cp++; switch(w) { case NLSCLASS_NL: @@ -1251,7 +1281,7 @@ RefPlusOne(int l) } cp = Cursor - l; c = *cp & CHAR; - w = NLSClassify(c, cp == InputBuf); + w = NLSClassify(c, cp == InputBuf, 0); switch(w) { case NLSCLASS_CTRL: PutPlusOne('^', 1); diff --git 
a/ed.xmap.c b/ed.xmap.c index 6e1d56e..36bce1e 100644 --- a/ed.xmap.c +++ b/ed.xmap.c @@ -743,7 +743,7 @@ unparsestring(const CStr *str, const Char *sep) *b++ = (unsigned char) p; } else if (p == ' ' || (Isprint(p) && !Isspace(p))) - b += one_wctomb((char *)b, p & CHAR); + b += one_wctomb((char *)b, p); else { *b++ = '\\'; *b++ = ((p >> 6) & 7) + '0'; diff --git a/sh.file.c b/sh.file.c index 343b774..3989d8a 100644 --- a/sh.file.c +++ b/sh.file.c @@ -249,7 +249,7 @@ pushback(const Char *string) char buf[MB_LEN_MAX]; size_t i, len; - len = one_wctomb(buf, *p & CHAR); + len = one_wctomb(buf, *p); for (i = 0; i < len; i++) (void) ioctl(SHOUT, TIOCSTI, (ioctl_t) &buf[i]); } diff --git a/sh.glob.c b/sh.glob.c index fc510bf..7d008aa 100644 --- a/sh.glob.c +++ b/sh.glob.c @@ -594,8 +594,13 @@ trim(Char **t) Char *p; while ((p = *t++) != '\0') - while (*p) - *p++ &= TRIM; + while (*p) { +#if INVALID_BYTE != 0 + if ((*p & INVALID_BYTE) != INVALID_BYTE) /* *p < INVALID_BYTE */ +#endif + *p &= TRIM; + p++; + } } int diff --git a/sh.h b/sh.h index e71a24e..75de557 100644 --- a/sh.h +++ b/sh.h @@ -707,14 +707,21 @@ extern struct sigaction parterm; /* Parents terminate catch */ #define ASCII 0177 #ifdef WIDE_STRINGS /* Implies SHORT_STRINGS */ /* 31st char bit used for 'ing (not 32nd, we want all values nonnegative) */ -# define QUOTE 0x40000000 -# define TRIM 0x3FFFFFFF /* Mask to strip quote bit */ +/* + * Notice + * + * With the fix for handling Unicode file names, the 32nd bit is now used. + * We need to use '&' instead of '> or <' when comparing with INVALID_BYTE etc. + * Cast to uChar is not recommended, + * because Char is 4bytes but uChar is 8bytes on I32LP64. 
*/ +# define QUOTE 0x80000000 +# define TRIM 0x7FFFFFFF /* Mask to strip quote bit */ # define UNDER 0x08000000 /* Underline flag */ # define BOLD 0x04000000 /* Bold flag */ # define STANDOUT 0x02000000 /* Standout flag */ # define LITERAL 0x01000000 /* Literal character flag */ # define ATTRIBUTES 0x0F000000 /* The bits used for attributes */ -# define INVALID_BYTE 0x00800000 /* Invalid character on input */ +# define INVALID_BYTE 0xF0000000 /* Invalid character on input */ # ifdef SOLARIS2 # define CHAR 0x30FFFFFF /* Mask to mask out the character */ # else @@ -743,6 +750,8 @@ extern struct sigaction parterm; /* Parents terminate catch */ #endif #define CHAR_DBWIDTH (LITERAL|(LITERAL-1)) +# define MAX_UTF32 0x7FFFFFFF /* max UTF32 is U+7FFFFFFF */ + EXTERN int AsciiOnly; /* If set only 7 bits expected in characters */ /* diff --git a/sh.hist.c b/sh.hist.c index b8f71b7..c0eded5 100644 --- a/sh.hist.c +++ b/sh.hist.c @@ -1199,7 +1199,7 @@ fmthist(int fmt, ptr_t ptr) buf = xmalloc(Strlen(istr) * MB_LEN_MAX + 1); for (p = buf, ip = istr; *ip != '\0'; ip++) - p += one_wctomb(p, CHAR & *ip); + p += one_wctomb(p, *ip); *p = '\0'; xfree(istr); diff --git a/sh.misc.c b/sh.misc.c index 7232b12..233ba5f 100644 --- a/sh.misc.c +++ b/sh.misc.c @@ -450,8 +450,13 @@ strip(Char *cp) if (!cp) return (cp); - while ((*dp++ &= TRIM) != '\0') - continue; + while (*dp != '\0') { +#if INVALID_BYTE != 0 + if ((*dp & INVALID_BYTE) != INVALID_BYTE) /* *dp < INVALID_BYTE */ +#endif + *dp &= TRIM; + dp++; + } return (cp); } @@ -462,8 +467,17 @@ quote(Char *cp) if (!cp) return (cp); - while (*dp != '\0') - *dp++ |= QUOTE; + while (*dp != '\0') { +#ifdef WIDE_STRINGS + if ((*dp & 0xffffff80) == 0) /* *dp < 0x80 */ +#elif defined SHORT_STRINGS + if ((*dp & 0xff80) == 0) /* *dp < 0x80 */ +#else + if ((*dp & 0x80) == 0) /* *dp < 0x80 */ +#endif + *dp |= QUOTE; + dp++; + } return (cp); } diff --git a/tc.func.c b/tc.func.c index 2b28a68..5a909d6 100644 --- a/tc.func.c +++ b/tc.func.c @@ -124,7 
+124,14 @@ expand_lex(const struct wordent *sp0, int from, int to) (((*s & TRIM) == '\\') && (prev_c != '\\')))) { Strbuf_append1(&buf, '\\'); } +#if INVALID_BYTE != 0 + if ((*s & INVALID_BYTE) != INVALID_BYTE) /* *s < INVALID_BYTE */ + Strbuf_append1(&buf, *s & TRIM); + else + Strbuf_append1(&buf, *s); +#else Strbuf_append1(&buf, *s & TRIM); +#endif prev_c = *s; } Strbuf_append1(&buf, ' '); diff --git a/tc.nls.c b/tc.nls.c index 2c38f3f..22ad173 100644 --- a/tc.nls.c +++ b/tc.nls.c @@ -64,7 +64,11 @@ NLSWidth(Char c) { # ifdef HAVE_WCWIDTH int l; +#if INVALID_BYTE != 0 + if ((c & INVALID_BYTE) == INVALID_BYTE) /* c >= INVALID_BYTE */ +#else if (c & INVALID_BYTE) +#endif return 1; l = xwcwidth((wchar_t) c); return l >= 0 ? l : 0; @@ -116,12 +120,36 @@ NLSChangeCase(const Char *p, int mode) } int -NLSClassify(Char c, int nocomb) +NLSClassify(Char c, int nocomb, int drawPrompt) { int w; - if (c & INVALID_BYTE) +#ifndef SHORT_STRINGS + if ((c & 0x80) != 0) /* c >= 0x80 */ return NLSCLASS_ILLEGAL; +#endif + if (!drawPrompt) { /* draw command-line */ +#if INVALID_BYTE != 0 + if ((c & INVALID_BYTE) == INVALID_BYTE) /* c >= INVALID_BYTE */ + return NLSCLASS_ILLEGAL; + if ((c & INVALID_BYTE) == QUOTE && (c & 0x80) == 0) /* c >= QUOTE */ + return 1; + if (c >= 0x10000000) /* U+10000000 = FC 90 80 80 80 80 */ + return NLSCLASS_ILLEGAL5; + if (c >= 0x1000000) /* U+1000000 = F9 80 80 80 80 */ + return NLSCLASS_ILLEGAL4; + if (c >= 0x100000) /* U+100000 = F4 80 80 80 */ + return NLSCLASS_ILLEGAL3; +#endif + if (c >= 0x10000) /* U+10000 = F0 90 80 80 */ + return NLSCLASS_ILLEGAL2; + } w = NLSWidth(c); + if (drawPrompt) { /* draw prompt */ + if (w > 0) + return w; + if (w == 0) + return 1; + } if ((w > 0 && !(Iscntrl(c) && (c & CHAR) < 0x100)) || (Isprint(c) && !nocomb)) return w; if (Iscntrl(c) && (c & CHAR) < 0x100) { @@ -131,13 +159,5 @@ NLSClassify(Char c, int nocomb) return NLSCLASS_TAB; return NLSCLASS_CTRL; } -#ifdef WIDE_STRINGS - if (c >= 0x1000000) - return 
NLSCLASS_ILLEGAL4; - if (c >= 0x10000) - return NLSCLASS_ILLEGAL3; -#endif - if (c >= 0x100) - return NLSCLASS_ILLEGAL2; return NLSCLASS_ILLEGAL; } diff --git a/tc.nls.h b/tc.nls.h index 4d27741..6930682 100644 --- a/tc.nls.h +++ b/tc.nls.h @@ -43,7 +43,7 @@ extern int NLSStringWidth (const Char *); #endif extern Char *NLSChangeCase (const Char *, int); -extern int NLSClassify (Char, int); +extern int NLSClassify (Char, int, int); #define NLSCLASS_CTRL (-1) #define NLSCLASS_TAB (-2) @@ -52,6 +52,7 @@ extern int NLSClassify (Char, int); #define NLSCLASS_ILLEGAL2 (-5) #define NLSCLASS_ILLEGAL3 (-6) #define NLSCLASS_ILLEGAL4 (-7) +#define NLSCLASS_ILLEGAL5 (-8) #define NLSCLASS_ILLEGAL_SIZE(x) (-(x) - (-(NLSCLASS_ILLEGAL) - 1)) diff --git a/tc.printf.c b/tc.printf.c index 7f2612d..c6be145 100644 --- a/tc.printf.c +++ b/tc.printf.c @@ -289,7 +289,7 @@ doprnt(void (*addchar) (int), const char *sfmt, va_list ap) (*addchar) ('\\' | attributes); count++; } - len = one_wctomb(cbuf, *Bp & CHAR); + len = one_wctomb(cbuf, *Bp); for (pos = 0; pos < len; pos++) { (*addchar) ((unsigned char)cbuf[pos] | attributes | (*Bp & ATTRIBUTES)); diff --git a/tc.str.c b/tc.str.c index c407cb8..c2b5ac8 100644 --- a/tc.str.c +++ b/tc.str.c @@ -66,10 +66,24 @@ one_wctomb(char *s, Char wchar) { int len; - if (wchar & INVALID_BYTE) { - s[0] = wchar & 0xFF; +#if INVALID_BYTE != 0 + if ((wchar & INVALID_BYTE) == INVALID_BYTE) { /* wchar >= INVALID_BYTE */ + /* invalid char + * example) + * if wchar = f0000090(=90|INVALID_BYTE), then *s = ffffff90 */ + *s = (char)wchar; len = 1; +#else + if (wchar & (CHAR & INVALID_BYTE)) { + s[0] = wchar & (CHAR & 0xFF); + len = 1; +#endif } else { +#if INVALID_BYTE != 0 + wchar &= MAX_UTF32; +#else + wchar &= CHAR; +#endif #ifdef UTF16_STRINGS if (wchar >= 0x10000) { /* UTF-16 systems can't handle these values directly in calls to @@ -224,7 +238,7 @@ short2str(const Char *src) dst = sdst; edst = &dst[dstsize]; while (*src) { - dst += one_wctomb(dst, *src & CHAR); 
+ dst += one_wctomb(dst, *src); src++; if (dst >= edst) { char *wdst = dst; @@ -544,7 +558,7 @@ short2qstr(const Char *src) dst = &edst[-MALLOC_INCR]; } } - dst += one_wctomb(dst, *src & CHAR); + dst += one_wctomb(dst, *src); src++; if (dst >= edst) { ptrdiff_t i = dst - edst; diff --git a/tw.parse.c b/tw.parse.c index 8309ed8..94982d6 100644 --- a/tw.parse.c +++ b/tw.parse.c @@ -618,7 +618,12 @@ insert_meta(const Char *cp, const Char *cpend, const Char *word, break; wq = w & QUOTE; - w &= ~QUOTE; +#if INVALID_BYTE != 0 + /* add checking INVALID_BYTE for FIX UTF32 */ + if ((w & INVALID_BYTE) != INVALID_BYTE) /* w < INVALID_BYTE */ +#else + w &= ~QUOTE; +#endif if (cmap(w, _ESC | _QF)) wq = QUOTE; /* quotes are always quoted */ -- 2.5.5