new utf decoder
This patch replaces the current UTF-8 decoder with a new one, which is ~50 lines shorter and should be easier to understand. Parsing 5- and 6-byte sequences, if necessary, requires only a trivial modification of the UTF_SIZ constant and the utfbyte, utfmask, utfmin, and utfmax arrays.
This commit is contained in:
parent
71328cbcdc
commit
45b808b88e
1 changed file with 81 additions and 133 deletions
214
st.c
214
st.c
|
@ -55,6 +55,7 @@ char *argv0;
|
||||||
#define XEMBED_FOCUS_OUT 5
|
#define XEMBED_FOCUS_OUT 5
|
||||||
|
|
||||||
/* Arbitrary sizes */
|
/* Arbitrary sizes */
|
||||||
|
#define UTF_INVALID 0xFFFD
|
||||||
#define UTF_SIZ 4
|
#define UTF_SIZ 4
|
||||||
#define ESC_BUF_SIZ (128*UTF_SIZ)
|
#define ESC_BUF_SIZ (128*UTF_SIZ)
|
||||||
#define ESC_ARG_SIZ 16
|
#define ESC_ARG_SIZ 16
|
||||||
|
@ -442,10 +443,12 @@ static void selcopy(void);
|
||||||
static void selscroll(int, int);
|
static void selscroll(int, int);
|
||||||
static void selsnap(int, int *, int *, int);
|
static void selsnap(int, int *, int *, int);
|
||||||
|
|
||||||
static int utf8decode(char *, long *);
|
static size_t utf8decode(char *, long *, size_t);
|
||||||
static int utf8encode(long *, char *);
|
static long utf8decodebyte(char, size_t *);
|
||||||
static int utf8size(char *);
|
static size_t utf8encode(long, char *, size_t);
|
||||||
static int isfullutf8(char *, int);
|
static char utf8encodebyte(long, size_t);
|
||||||
|
static size_t utf8len(char *);
|
||||||
|
static size_t utf8validate(long *, size_t);
|
||||||
|
|
||||||
static ssize_t xwrite(int, char *, size_t);
|
static ssize_t xwrite(int, char *, size_t);
|
||||||
static void *xmalloc(size_t);
|
static void *xmalloc(size_t);
|
||||||
|
@ -490,6 +493,11 @@ static int oldbutton = 3; /* button event on startup: 3 = release */
|
||||||
static char *usedfont = NULL;
|
static char *usedfont = NULL;
|
||||||
static double usedfontsize = 0;
|
static double usedfontsize = 0;
|
||||||
|
|
||||||
|
static uchar utfbyte[UTF_SIZ + 1] = {0x80, 0, 0xC0, 0xE0, 0xF0};
|
||||||
|
static uchar utfmask[UTF_SIZ + 1] = {0xC0, 0x80, 0xE0, 0xF0, 0xF8};
|
||||||
|
static long utfmin[UTF_SIZ + 1] = { 0, 0, 0x80, 0x800, 0x10000};
|
||||||
|
static long utfmax[UTF_SIZ + 1] = {0x10FFFF, 0x7F, 0x7FF, 0xFFFF, 0x10FFFF};
|
||||||
|
|
||||||
/* Font Ring Cache */
|
/* Font Ring Cache */
|
||||||
enum {
|
enum {
|
||||||
FRC_NORMAL,
|
FRC_NORMAL,
|
||||||
|
@ -549,128 +557,69 @@ xstrdup(char *s) {
|
||||||
return p;
|
return p;
|
||||||
}
|
}
|
||||||
|
|
||||||
int
|
size_t
|
||||||
utf8decode(char *s, long *u) {
|
utf8decode(char *c, long *u, size_t clen) {
|
||||||
uchar c;
|
size_t i, j, len, type;
|
||||||
int i, n, rtn;
|
long udecoded;
|
||||||
|
|
||||||
rtn = 1;
|
*u = UTF_INVALID;
|
||||||
c = *s;
|
if(!clen)
|
||||||
if(~c & 0x80) { /* 0xxxxxxx */
|
return 0;
|
||||||
*u = c;
|
udecoded = utf8decodebyte(c[0], &len);
|
||||||
return rtn;
|
if(!BETWEEN(len, 1, UTF_SIZ))
|
||||||
} else if((c & 0xE0) == 0xC0) { /* 110xxxxx */
|
return 1;
|
||||||
*u = c & 0x1F;
|
for(i = 1, j = 1; i < clen && j < len; ++i, ++j) {
|
||||||
n = 1;
|
udecoded = (udecoded << 6) | utf8decodebyte(c[i], &type);
|
||||||
} else if((c & 0xF0) == 0xE0) { /* 1110xxxx */
|
if(type != 0)
|
||||||
*u = c & 0x0F;
|
return j;
|
||||||
n = 2;
|
|
||||||
} else if((c & 0xF8) == 0xF0) { /* 11110xxx */
|
|
||||||
*u = c & 0x07;
|
|
||||||
n = 3;
|
|
||||||
} else {
|
|
||||||
goto invalid;
|
|
||||||
}
|
}
|
||||||
|
if(j < len)
|
||||||
for(i = n, ++s; i > 0; --i, ++rtn, ++s) {
|
return 0;
|
||||||
c = *s;
|
*u = udecoded;
|
||||||
if((c & 0xC0) != 0x80) /* 10xxxxxx */
|
utf8validate(u, len);
|
||||||
goto invalid;
|
return len;
|
||||||
*u <<= 6;
|
|
||||||
*u |= c & 0x3F;
|
|
||||||
}
|
|
||||||
|
|
||||||
if((n == 1 && *u < 0x80) ||
|
|
||||||
(n == 2 && *u < 0x800) ||
|
|
||||||
(n == 3 && *u < 0x10000) ||
|
|
||||||
(*u >= 0xD800 && *u <= 0xDFFF)) {
|
|
||||||
goto invalid;
|
|
||||||
}
|
|
||||||
|
|
||||||
return rtn;
|
|
||||||
invalid:
|
|
||||||
*u = 0xFFFD;
|
|
||||||
|
|
||||||
return rtn;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
int
|
long
|
||||||
utf8encode(long *u, char *s) {
|
utf8decodebyte(char c, size_t *i) {
|
||||||
uchar *sp;
|
for(*i = 0; *i < LEN(utfmask); ++(*i))
|
||||||
ulong uc;
|
if(((uchar)c & utfmask[*i]) == utfbyte[*i])
|
||||||
int i, n;
|
return (uchar)c & ~utfmask[*i];
|
||||||
|
return 0;
|
||||||
sp = (uchar *)s;
|
|
||||||
uc = *u;
|
|
||||||
if(uc < 0x80) {
|
|
||||||
*sp = uc; /* 0xxxxxxx */
|
|
||||||
return 1;
|
|
||||||
} else if(*u < 0x800) {
|
|
||||||
*sp = (uc >> 6) | 0xC0; /* 110xxxxx */
|
|
||||||
n = 1;
|
|
||||||
} else if(uc < 0x10000) {
|
|
||||||
*sp = (uc >> 12) | 0xE0; /* 1110xxxx */
|
|
||||||
n = 2;
|
|
||||||
} else if(uc <= 0x10FFFF) {
|
|
||||||
*sp = (uc >> 18) | 0xF0; /* 11110xxx */
|
|
||||||
n = 3;
|
|
||||||
} else {
|
|
||||||
goto invalid;
|
|
||||||
}
|
|
||||||
|
|
||||||
for(i=n,++sp; i>0; --i,++sp)
|
|
||||||
*sp = ((uc >> 6*(i-1)) & 0x3F) | 0x80; /* 10xxxxxx */
|
|
||||||
|
|
||||||
return n+1;
|
|
||||||
invalid:
|
|
||||||
/* U+FFFD */
|
|
||||||
*s++ = '\xEF';
|
|
||||||
*s++ = '\xBF';
|
|
||||||
*s = '\xBD';
|
|
||||||
|
|
||||||
return 3;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* use this if your buffer is less than UTF_SIZ, it returns 1 if you can decode
|
size_t
|
||||||
UTF-8 otherwise return 0 */
|
utf8encode(long u, char *c, size_t clen) {
|
||||||
int
|
size_t len, i;
|
||||||
isfullutf8(char *s, int b) {
|
|
||||||
uchar *c1, *c2, *c3;
|
|
||||||
|
|
||||||
c1 = (uchar *)s;
|
len = utf8validate(&u, 0);
|
||||||
c2 = (uchar *)++s;
|
if(clen < len)
|
||||||
c3 = (uchar *)++s;
|
|
||||||
if(b < 1) {
|
|
||||||
return 0;
|
return 0;
|
||||||
} else if((*c1 & 0xE0) == 0xC0 && b == 1) {
|
for(i = len - 1; i != 0; --i) {
|
||||||
return 0;
|
c[i] = utf8encodebyte(u, 0);
|
||||||
} else if((*c1 & 0xF0) == 0xE0 &&
|
u >>= 6;
|
||||||
((b == 1) ||
|
|
||||||
((b == 2) && (*c2 & 0xC0) == 0x80))) {
|
|
||||||
return 0;
|
|
||||||
} else if((*c1 & 0xF8) == 0xF0 &&
|
|
||||||
((b == 1) ||
|
|
||||||
((b == 2) && (*c2 & 0xC0) == 0x80) ||
|
|
||||||
((b == 3) && (*c2 & 0xC0) == 0x80 && (*c3 & 0xC0) == 0x80))) {
|
|
||||||
return 0;
|
|
||||||
} else {
|
|
||||||
return 1;
|
|
||||||
}
|
}
|
||||||
|
c[0] = utf8encodebyte(u, len);
|
||||||
|
return len;
|
||||||
}
|
}
|
||||||
|
|
||||||
int
|
char
|
||||||
utf8size(char *s) {
|
utf8encodebyte(long u, size_t i) {
|
||||||
uchar c = *s;
|
return utfbyte[i] | (u & ~utfmask[i]);
|
||||||
|
}
|
||||||
|
|
||||||
if(~c & 0x80) {
|
size_t
|
||||||
return 1;
|
utf8len(char *c) {
|
||||||
} else if((c & 0xE0) == 0xC0) {
|
return utf8decode(c, &(long){0}, UTF_SIZ);
|
||||||
return 2;
|
}
|
||||||
} else if((c & 0xF0) == 0xE0) {
|
|
||||||
return 3;
|
size_t
|
||||||
} else {
|
utf8validate(long *u, size_t i) {
|
||||||
return 4;
|
if(!BETWEEN(*u, utfmin[i], utfmax[i]) || BETWEEN(*u, 0xD800, 0xDFFF))
|
||||||
}
|
*u = UTF_INVALID;
|
||||||
|
for(i = 1; *u > utfmax[i]; ++i)
|
||||||
|
;
|
||||||
|
return i;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void
|
static void
|
||||||
|
@ -984,7 +933,7 @@ getsel(void) {
|
||||||
if(!selected(x, y) || (gp->mode & ATTR_WDUMMY))
|
if(!selected(x, y) || (gp->mode & ATTR_WDUMMY))
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
size = utf8size(gp->c);
|
size = utf8len(gp->c);
|
||||||
memcpy(ptr, gp->c, size);
|
memcpy(ptr, gp->c, size);
|
||||||
ptr += size;
|
ptr += size;
|
||||||
}
|
}
|
||||||
|
@ -1298,7 +1247,7 @@ ttyread(void) {
|
||||||
char *ptr;
|
char *ptr;
|
||||||
char s[UTF_SIZ];
|
char s[UTF_SIZ];
|
||||||
int charsize; /* size of utf8 char in bytes */
|
int charsize; /* size of utf8 char in bytes */
|
||||||
long utf8c;
|
long unicodep;
|
||||||
int ret;
|
int ret;
|
||||||
|
|
||||||
/* append read bytes to unprocessed bytes */
|
/* append read bytes to unprocessed bytes */
|
||||||
|
@ -1308,9 +1257,8 @@ ttyread(void) {
|
||||||
/* process every complete utf8 char */
|
/* process every complete utf8 char */
|
||||||
buflen += ret;
|
buflen += ret;
|
||||||
ptr = buf;
|
ptr = buf;
|
||||||
while(buflen >= UTF_SIZ || isfullutf8(ptr,buflen)) {
|
while(charsize = utf8decode(ptr, &unicodep, buflen)) {
|
||||||
charsize = utf8decode(ptr, &utf8c);
|
utf8encode(unicodep, s, UTF_SIZ);
|
||||||
utf8encode(&utf8c, s);
|
|
||||||
tputc(s, charsize);
|
tputc(s, charsize);
|
||||||
ptr += charsize;
|
ptr += charsize;
|
||||||
buflen -= charsize;
|
buflen -= charsize;
|
||||||
|
@ -2414,14 +2362,14 @@ void
|
||||||
tputc(char *c, int len) {
|
tputc(char *c, int len) {
|
||||||
uchar ascii = *c;
|
uchar ascii = *c;
|
||||||
bool control = ascii < '\x20' || ascii == 0177;
|
bool control = ascii < '\x20' || ascii == 0177;
|
||||||
long u8char;
|
long unicodep;
|
||||||
int width;
|
int width;
|
||||||
|
|
||||||
if(len == 1) {
|
if(len == 1) {
|
||||||
width = 1;
|
width = 1;
|
||||||
} else {
|
} else {
|
||||||
utf8decode(c, &u8char);
|
utf8decode(c, &unicodep, UTF_SIZ);
|
||||||
width = wcwidth(u8char);
|
width = wcwidth(unicodep);
|
||||||
}
|
}
|
||||||
|
|
||||||
if(IS_SET(MODE_PRINT))
|
if(IS_SET(MODE_PRINT))
|
||||||
|
@ -3150,7 +3098,7 @@ xdraws(char *s, Glyph base, int x, int y, int charlen, int bytelen) {
|
||||||
int frcflags;
|
int frcflags;
|
||||||
int u8fl, u8fblen, u8cblen, doesexist;
|
int u8fl, u8fblen, u8cblen, doesexist;
|
||||||
char *u8c, *u8fs;
|
char *u8c, *u8fs;
|
||||||
long u8char;
|
long unicodep;
|
||||||
Font *font = &dc.font;
|
Font *font = &dc.font;
|
||||||
FcResult fcres;
|
FcResult fcres;
|
||||||
FcPattern *fcpattern, *fontpattern;
|
FcPattern *fcpattern, *fontpattern;
|
||||||
|
@ -3293,11 +3241,11 @@ xdraws(char *s, Glyph base, int x, int y, int charlen, int bytelen) {
|
||||||
oneatatime = font->width != xw.cw;
|
oneatatime = font->width != xw.cw;
|
||||||
for(;;) {
|
for(;;) {
|
||||||
u8c = s;
|
u8c = s;
|
||||||
u8cblen = utf8decode(s, &u8char);
|
u8cblen = utf8decode(s, &unicodep, UTF_SIZ);
|
||||||
s += u8cblen;
|
s += u8cblen;
|
||||||
bytelen -= u8cblen;
|
bytelen -= u8cblen;
|
||||||
|
|
||||||
doesexist = XftCharExists(xw.dpy, font->match, u8char);
|
doesexist = XftCharExists(xw.dpy, font->match, unicodep);
|
||||||
if(oneatatime || !doesexist || bytelen <= 0) {
|
if(oneatatime || !doesexist || bytelen <= 0) {
|
||||||
if(oneatatime || bytelen <= 0) {
|
if(oneatatime || bytelen <= 0) {
|
||||||
if(doesexist) {
|
if(doesexist) {
|
||||||
|
@ -3329,7 +3277,7 @@ xdraws(char *s, Glyph base, int x, int y, int charlen, int bytelen) {
|
||||||
|
|
||||||
/* Search the font cache. */
|
/* Search the font cache. */
|
||||||
for(i = 0; i < frclen; i++) {
|
for(i = 0; i < frclen; i++) {
|
||||||
if(XftCharExists(xw.dpy, frc[i].font, u8char)
|
if(XftCharExists(xw.dpy, frc[i].font, unicodep)
|
||||||
&& frc[i].flags == frcflags) {
|
&& frc[i].flags == frcflags) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
@ -3351,7 +3299,7 @@ xdraws(char *s, Glyph base, int x, int y, int charlen, int bytelen) {
|
||||||
fcpattern = FcPatternDuplicate(font->pattern);
|
fcpattern = FcPatternDuplicate(font->pattern);
|
||||||
fccharset = FcCharSetCreate();
|
fccharset = FcCharSetCreate();
|
||||||
|
|
||||||
FcCharSetAddChar(fccharset, u8char);
|
FcCharSetAddChar(fccharset, unicodep);
|
||||||
FcPatternAddCharSet(fcpattern, FC_CHARSET,
|
FcPatternAddCharSet(fcpattern, FC_CHARSET,
|
||||||
fccharset);
|
fccharset);
|
||||||
FcPatternAddBool(fcpattern, FC_SCALABLE,
|
FcPatternAddBool(fcpattern, FC_SCALABLE,
|
||||||
|
@ -3387,7 +3335,7 @@ xdraws(char *s, Glyph base, int x, int y, int charlen, int bytelen) {
|
||||||
xp, winy + frc[i].font->ascent,
|
xp, winy + frc[i].font->ascent,
|
||||||
(FcChar8 *)u8c, u8cblen);
|
(FcChar8 *)u8c, u8cblen);
|
||||||
|
|
||||||
xp += xw.cw * wcwidth(u8char);
|
xp += xw.cw * wcwidth(unicodep);
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -3430,7 +3378,7 @@ xdrawcursor(void) {
|
||||||
memcpy(g.c, term.line[term.c.y][term.c.x].c, UTF_SIZ);
|
memcpy(g.c, term.line[term.c.y][term.c.x].c, UTF_SIZ);
|
||||||
|
|
||||||
/* remove the old cursor */
|
/* remove the old cursor */
|
||||||
sl = utf8size(term.line[oldy][oldx].c);
|
sl = utf8len(term.line[oldy][oldx].c);
|
||||||
width = (term.line[oldy][oldx].mode & ATTR_WIDE)? 2 : 1;
|
width = (term.line[oldy][oldx].mode & ATTR_WIDE)? 2 : 1;
|
||||||
xdraws(term.line[oldy][oldx].c, term.line[oldy][oldx], oldx,
|
xdraws(term.line[oldy][oldx].c, term.line[oldy][oldx], oldx,
|
||||||
oldy, width, sl);
|
oldy, width, sl);
|
||||||
|
@ -3444,7 +3392,7 @@ xdrawcursor(void) {
|
||||||
g.bg = defaultfg;
|
g.bg = defaultfg;
|
||||||
}
|
}
|
||||||
|
|
||||||
sl = utf8size(g.c);
|
sl = utf8len(g.c);
|
||||||
width = (term.line[term.c.y][curx].mode & ATTR_WIDE)\
|
width = (term.line[term.c.y][curx].mode & ATTR_WIDE)\
|
||||||
? 2 : 1;
|
? 2 : 1;
|
||||||
xdraws(g.c, g, term.c.x, term.c.y, width, sl);
|
xdraws(g.c, g, term.c.x, term.c.y, width, sl);
|
||||||
|
@ -3516,7 +3464,7 @@ drawregion(int x1, int y1, int x2, int y2) {
|
||||||
Glyph base, new;
|
Glyph base, new;
|
||||||
char buf[DRAW_BUF_SIZ];
|
char buf[DRAW_BUF_SIZ];
|
||||||
bool ena_sel = sel.ob.x != -1;
|
bool ena_sel = sel.ob.x != -1;
|
||||||
long u8char;
|
long unicodep;
|
||||||
|
|
||||||
if(sel.alt ^ IS_SET(MODE_ALTSCREEN))
|
if(sel.alt ^ IS_SET(MODE_ALTSCREEN))
|
||||||
ena_sel = 0;
|
ena_sel = 0;
|
||||||
|
@ -3548,7 +3496,7 @@ drawregion(int x1, int y1, int x2, int y2) {
|
||||||
base = new;
|
base = new;
|
||||||
}
|
}
|
||||||
|
|
||||||
sl = utf8decode(new.c, &u8char);
|
sl = utf8decode(new.c, &unicodep, UTF_SIZ);
|
||||||
memcpy(buf+ib, new.c, sl);
|
memcpy(buf+ib, new.c, sl);
|
||||||
ib += sl;
|
ib += sl;
|
||||||
ic += (new.mode & ATTR_WIDE)? 2 : 1;
|
ic += (new.mode & ATTR_WIDE)? 2 : 1;
|
||||||
|
@ -3707,7 +3655,7 @@ kpress(XEvent *ev) {
|
||||||
if(IS_SET(MODE_8BIT)) {
|
if(IS_SET(MODE_8BIT)) {
|
||||||
if(*buf < 0177) {
|
if(*buf < 0177) {
|
||||||
c = *buf | 0x80;
|
c = *buf | 0x80;
|
||||||
len = utf8encode(&c, buf);
|
len = utf8encode(c, buf, UTF_SIZ);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
buf[1] = buf[0];
|
buf[1] = buf[0];
|
||||||
|
|
Loading…
Reference in a new issue