import from plan9

This commit is contained in:
rsc 2006-05-21 18:57:51 +00:00
parent 44fc56d8c3
commit 536f9b83c0
9 changed files with 320 additions and 62 deletions

View file

@ -13,6 +13,8 @@ void uksc_in(int fd, long *notused, struct convert *out);
void uksc_out(Rune *base, int n, long *notused); void uksc_out(Rune *base, int n, long *notused);
void html_in(int fd, long *notused, struct convert *out); void html_in(int fd, long *notused, struct convert *out);
void html_out(Rune *base, int n, long *notused); void html_out(Rune *base, int n, long *notused);
void tune_in(int fd, long *notused, struct convert *out);
void tune_out(Rune *base, int n, long *notused);
#define emit(x) *(*r)++ = (x) #define emit(x) *(*r)++ = (x)
#define NRUNE 65536 #define NRUNE 65536

View file

@ -110,6 +110,7 @@ big5_in(int fd, long *notused, struct convert *out)
big5proc(-1, &r, nin); big5proc(-1, &r, nin);
if(r > ob) if(r > ob)
OUT(out, ob, r-ob); OUT(out, ob, r-ob);
OUT(out, ob, 0);
} }
void void

View file

@ -88,6 +88,7 @@ gb_in(int fd, long *notused, struct convert *out)
gbproc(-1, &r, nin); gbproc(-1, &r, nin);
if(r > ob) if(r > ob)
OUT(out, ob, r-ob); OUT(out, ob, r-ob);
OUT(out, ob, 0);
} }
void void

View file

@ -363,6 +363,7 @@ do_in(int fd, void (*procfn)(int, Rune **, long), struct convert *out)
(*procfn)(-1, &r, nin); (*procfn)(-1, &r, nin);
if(r > ob) if(r > ob)
OUT(out, ob, r-ob); OUT(out, ob, r-ob);
OUT(out, ob, 0);
} }
void void

View file

@ -109,6 +109,7 @@ uksc_in(int fd, long *notused, struct convert *out)
ukscproc(-1, &r, nin); ukscproc(-1, &r, nin);
if(r > ob) if(r > ob)
OUT(out, ob, r-ob); OUT(out, ob, r-ob);
OUT(out, ob, 0);
} }
void void

View file

@ -19,132 +19,251 @@ static Hchar byname[] =
{"Aacute", 193}, {"Aacute", 193},
{"Acirc", 194}, {"Acirc", 194},
{"Agrave", 192}, {"Agrave", 192},
{"Alpha", 913},
{"Aring", 197}, {"Aring", 197},
{"Atilde", 195}, {"Atilde", 195},
{"Auml", 196}, {"Auml", 196},
{"Beta", 914},
{"Ccedil", 199}, {"Ccedil", 199},
{"Chi", 935},
{"Dagger", 8225},
{"Delta", 916},
{"ETH", 208}, {"ETH", 208},
{"Eacute", 201}, {"Eacute", 201},
{"Ecirc", 202}, {"Ecirc", 202},
{"Egrave", 200}, {"Egrave", 200},
{"Epsilon", 917},
{"Eta", 919},
{"Euml", 203}, {"Euml", 203},
{"Gamma", 915},
{"Iacute", 205}, {"Iacute", 205},
{"Icirc", 206}, {"Icirc", 206},
{"Igrave", 204}, {"Igrave", 204},
{"Iota", 921},
{"Iuml", 207}, {"Iuml", 207},
{"Kappa", 922},
{"Lambda", 923},
{"Mu", 924},
{"Ntilde", 209}, {"Ntilde", 209},
{"Nu", 925},
{"OElig", 338},
{"Oacute", 211}, {"Oacute", 211},
{"Ocirc", 212}, {"Ocirc", 212},
{"Ograve", 210}, {"Ograve", 210},
{"Omega", 937},
{"Omicron", 927},
{"Oslash", 216}, {"Oslash", 216},
{"Otilde", 213}, {"Otilde", 213},
{"Ouml", 214}, {"Ouml", 214},
{"Phi", 934},
{"Pi", 928},
{"Prime", 8243},
{"Psi", 936},
{"Rho", 929},
{"Scaron", 352},
{"Sigma", 931},
{"THORN", 222}, {"THORN", 222},
{"Tau", 932},
{"Theta", 920},
{"Uacute", 218}, {"Uacute", 218},
{"Ucirc", 219}, {"Ucirc", 219},
{"Ugrave", 217}, {"Ugrave", 217},
{"Upsilon", 933},
{"Uuml", 220}, {"Uuml", 220},
{"Xi", 926},
{"Yacute", 221}, {"Yacute", 221},
{"Yuml", 376},
{"Zeta", 918},
{"aacute", 225}, {"aacute", 225},
{"acirc", 226}, {"acirc", 226},
{"acute", 180}, {"acute", 180},
{"aelig", 230}, {"aelig", 230},
{"agrave", 224}, {"agrave", 224},
{"alefsym", 8501},
{"alpha", 945}, {"alpha", 945},
{"amp", 38},
{"and", 8743},
{"ang", 8736},
{"aring", 229}, {"aring", 229},
{"asymp", 8776},
{"atilde", 227}, {"atilde", 227},
{"auml", 228}, {"auml", 228},
{"bdquo", 8222},
{"beta", 946}, {"beta", 946},
{"brvbar", 166}, {"brvbar", 166},
{"bull", 8226},
{"cap", 8745},
{"ccedil", 231}, {"ccedil", 231},
{"cdots", 8943}, {"cdots", 8943},
{"cedil", 184}, {"cedil", 184},
{"cent", 162}, {"cent", 162},
{"chi", 967}, {"chi", 967},
{"circ", 710},
{"clubs", 9827},
{"cong", 8773},
{"copy", 169}, {"copy", 169},
{"crarr", 8629},
{"cup", 8746},
{"curren", 164}, {"curren", 164},
{"dArr", 8659},
{"dagger", 8224},
{"darr", 8595},
{"ddots", 8945}, {"ddots", 8945},
{"deg", 176}, {"deg", 176},
{"delta", 948}, {"delta", 948},
{"diams", 9830},
{"divide", 247}, {"divide", 247},
{"eacute", 233}, {"eacute", 233},
{"ecirc", 234}, {"ecirc", 234},
{"egrave", 232}, {"egrave", 232},
{"emdash", 8212}, /* non-standard but commonly used */ {"emdash", 8212}, /* non-standard but commonly used */
{"empty", 8709},
{"emsp", 8195}, {"emsp", 8195},
{"endash", 8211}, /* non-standard but commonly used */ {"endash", 8211}, /* non-standard but commonly used */
{"ensp", 8194}, {"ensp", 8194},
{"epsilon", 949}, {"epsilon", 949},
{"equiv", 8801},
{"eta", 951}, {"eta", 951},
{"eth", 240}, {"eth", 240},
{"euml", 235}, {"euml", 235},
{"euro", 8364},
{"exist", 8707},
{"fnof", 402},
{"forall", 8704},
{"frac12", 189}, {"frac12", 189},
{"frac14", 188}, {"frac14", 188},
{"frac34", 190}, {"frac34", 190},
{"frasl", 8260},
{"gamma", 947}, {"gamma", 947},
{"ge", 8805},
{"gt", 62},
{"hArr", 8660},
{"harr", 8596},
{"hearts", 9829},
{"hellip", 8230},
{"iacute", 237}, {"iacute", 237},
{"icirc", 238}, {"icirc", 238},
{"iexcl", 161}, {"iexcl", 161},
{"igrave", 236}, {"igrave", 236},
{"image", 8465},
{"infin", 8734},
{"int", 8747},
{"iota", 953}, {"iota", 953},
{"iquest", 191}, {"iquest", 191},
{"isin", 8712},
{"iuml", 239}, {"iuml", 239},
{"kappa", 954}, {"kappa", 954},
{"lArr", 8656},
{"lambda", 955}, {"lambda", 955},
{"lang", 9001},
{"laquo", 171}, {"laquo", 171},
{"ldquo", 8220}, {"larr", 8592},
{"lceil", 8968},
{"ldots", 8230}, {"ldots", 8230},
{"ldquo", 8220},
{"le", 8804},
{"lfloor", 8970},
{"lowast", 8727},
{"loz", 9674},
{"lrm", 8206},
{"lsaquo", 8249},
{"lsquo", 8216}, {"lsquo", 8216},
{"lt", 60},
{"macr", 175}, {"macr", 175},
{"mdash", 8212}, {"mdash", 8212},
{"micro", 181}, {"micro", 181},
{"middot", 183}, {"middot", 183},
{"minus", 8722},
{"mu", 956}, {"mu", 956},
{"nabla", 8711},
{"nbsp", 160}, {"nbsp", 160},
{"ndash", 8211}, {"ndash", 8211},
{"ne", 8800},
{"ni", 8715},
{"not", 172}, {"not", 172},
{"notin", 8713},
{"nsub", 8836},
{"ntilde", 241}, {"ntilde", 241},
{"nu", 957}, {"nu", 957},
{"oacute", 243}, {"oacute", 243},
{"ocirc", 244}, {"ocirc", 244},
{"oelig", 339},
{"ograve", 242}, {"ograve", 242},
{"oline", 8254},
{"omega", 969}, {"omega", 969},
{"omicron", 959}, {"omicron", 959},
{"oplus", 8853},
{"or", 8744},
{"ordf", 170}, {"ordf", 170},
{"ordm", 186}, {"ordm", 186},
{"oslash", 248}, {"oslash", 248},
{"otilde", 245}, {"otilde", 245},
{"otimes", 8855},
{"ouml", 246}, {"ouml", 246},
{"para", 182}, {"para", 182},
{"part", 8706},
{"permil", 8240},
{"perp", 8869},
{"phi", 966}, {"phi", 966},
{"pi", 960}, {"pi", 960},
{"piv", 982},
{"plusmn", 177}, {"plusmn", 177},
{"pound", 163}, {"pound", 163},
{"prime", 8242},
{"prod", 8719},
{"prop", 8733},
{"psi", 968}, {"psi", 968},
{"quad", 8193}, {"quad", 8193},
{"quot", 34},
{"rArr", 8658},
{"radic", 8730},
{"rang", 9002},
{"raquo", 187}, {"raquo", 187},
{"rarr", 8594},
{"rceil", 8969},
{"rdquo", 8221}, {"rdquo", 8221},
{"real", 8476},
{"reg", 174}, {"reg", 174},
{"rfloor", 8971},
{"rho", 961}, {"rho", 961},
{"rlm", 8207},
{"rsaquo", 8250},
{"rsquo", 8217}, {"rsquo", 8217},
{"sbquo", 8218},
{"scaron", 353},
{"sdot", 8901},
{"sect", 167}, {"sect", 167},
{"shy", 173}, {"shy", 173},
{"sigma", 963}, {"sigma", 963},
{"sigmaf", 962},
{"sim", 8764},
{"sp", 8194}, {"sp", 8194},
{"spades", 9824},
{"sub", 8834},
{"sube", 8838},
{"sum", 8721},
{"sup", 8835},
{"sup1", 185}, {"sup1", 185},
{"sup2", 178}, {"sup2", 178},
{"sup3", 179}, {"sup3", 179},
{"supe", 8839},
{"szlig", 223}, {"szlig", 223},
{"tau", 964}, {"tau", 964},
{"there4", 8756},
{"theta", 952}, {"theta", 952},
{"thetasym", 977},
{"thinsp", 8201}, {"thinsp", 8201},
{"thorn", 254}, {"thorn", 254},
{"tilde", 732},
{"times", 215}, {"times", 215},
{"trade", 8482}, {"trade", 8482},
{"uArr", 8657},
{"uacute", 250}, {"uacute", 250},
{"uarr", 8593},
{"ucirc", 251}, {"ucirc", 251},
{"ugrave", 249}, {"ugrave", 249},
{"uml", 168}, {"uml", 168},
{"upsih", 978},
{"upsilon", 965}, {"upsilon", 965},
{"uuml", 252}, {"uuml", 252},
{"varepsilon", 8712}, {"varepsilon", 8712},
@ -154,11 +273,14 @@ static Hchar byname[] =
{"vdots", 8942}, {"vdots", 8942},
{"vsigma", 962}, {"vsigma", 962},
{"vtheta", 977}, {"vtheta", 977},
{"weierp", 8472},
{"xi", 958}, {"xi", 958},
{"yacute", 253}, {"yacute", 253},
{"yen", 165}, {"yen", 165},
{"yuml", 255}, {"yuml", 255},
{"zeta", 950} {"zeta", 950},
{"zwj", 8205},
{"zwnj", 8204}
}; };
static Hchar byrune[nelem(byname)]; static Hchar byrune[nelem(byname)];
@ -302,6 +424,7 @@ html_in(int fd, long *x, struct convert *out)
} }
if(r > rbuf) if(r > rbuf)
OUT(out, rbuf, r-rbuf); OUT(out, rbuf, r-rbuf);
OUT(out, rbuf, 0);
} }
/* /*
@ -314,6 +437,7 @@ html_out(Rune *r, int n, long *x)
Biobuf b; Biobuf b;
Rune *er; Rune *er;
USED(x);
html_init(); html_init();
Binit(&b, 1, OWRITE); Binit(&b, 1, OWRITE);
er = r+n; er = r+n;

View file

@ -11,7 +11,8 @@ OFILES=tcs.$O\
kuten208.$O\ kuten208.$O\
gb.$O\ gb.$O\
ksc.$O\ ksc.$O\
big5.$O big5.$O\
tune.$O\
<$PLAN9/src/mkone <$PLAN9/src/mkone
CFLAGS= -DPLAN9 $CFLAGS CFLAGS= -DPLAN9 $CFLAGS
@ -23,6 +24,9 @@ tcs.$O big5.$O: big5.h
tcs.$O gb.$O: gb.h tcs.$O gb.$O: gb.h
tcs.$O: cyrillic.h tcs.$O: cyrillic.h
tcs.$O: conv.h tcs.$O: conv.h
tcs.$O: 8859.h
tcs.$O: ms.h
tcs.$O: misc.h
conv%.$O: conv.h conv%.$O: conv.h
conv_ksc.$O: ksc.h conv_ksc.$O: ksc.h

View file

@ -54,7 +54,7 @@ main(int argc, char **argv)
clean = 1; clean = 1;
break; break;
case 'f': case 'f':
from = ARGF(); from = EARGF(usage());
break; break;
case 'l': case 'l':
listem = 1; listem = 1;
@ -63,7 +63,7 @@ main(int argc, char **argv)
squawk = 0; squawk = 0;
break; break;
case 't': case 't':
to = ARGF(); to = EARGF(usage());
break; break;
case 'v': case 'v':
verbose = 1; verbose = 1;
@ -160,7 +160,7 @@ conv(char *name, int from)
struct convert *c; struct convert *c;
for(c = convert; c->name; c++){ for(c = convert; c->name; c++){
if(strcmp(c->name, name) != 0) if(cistrcmp(c->name, name) != 0)
continue; continue;
if(c->flags&Table) if(c->flags&Table)
return(c); return(c);
@ -208,25 +208,81 @@ unicode_in(int fd, long *notused, struct convert *out)
} }
while((n = read(fd, (char *)buf, 2*N)) > 0){ while((n = read(fd, (char *)buf, 2*N)) > 0){
ninput += n; ninput += n;
if(swabme)
swab2((char *)buf, n);
if(n&1){ if(n&1){
if(squawk) if(squawk)
EPR "%s: odd byte count in %s\n", argv0, file); EPR "%s: odd byte count in %s\n", argv0, file);
nerrors++; nerrors++;
if(clean) if(clean)
n--; n--;
else { else
n++; buf[n++/2] = Runeerror;
buf[n/2] = Runeerror;
if(swabme) /* swab so later swab undoes it */
swab2((char *)&buf[n/2], 2);
}
} }
if(swabme)
swab2((char *)buf, n);
OUT(out, buf, n/2); OUT(out, buf, n/2);
} }
} }
/*
 * Read big-endian 16-bit code units from fd and hand them to the
 * output converter as runes.
 *
 * Each chunk read is converted in place: byte pair k of the buffer
 * (high byte first) becomes buf[k].  A chunk with an odd byte count
 * is reported when squawk is set and counted in nerrors; the stray
 * byte is then dropped when clean is set, or replaced by Runeerror
 * otherwise.  After read stops returning data, OUT is invoked once
 * more with a count of zero.
 *
 * NOTE(review): the in-place conversion and the 2*N read size look
 * like they assume sizeof(Rune) == 2 -- confirm against the Rune
 * typedef in use.
 */
void
unicode_in_be(int fd, long *notused, struct convert *out)
{
	int k, n, pairs;
	Rune buf[N];
	uchar *src;

	USED(notused);
	for(;;){
		n = read(fd, (char *)buf, 2*N);
		if(n <= 0)
			break;
		ninput += n;

		/* both bytes of a pair are fetched before buf[k] is stored */
		src = (uchar*)buf;
		pairs = n/2;
		for(k = 0; k < pairs; k++)
			buf[k] = (src[2*k]<<8) | src[2*k+1];

		if(n&1){
			if(squawk)
				EPR "%s: odd byte count in %s\n", argv0, file);
			nerrors++;
			if(clean)
				n--;
			else{
				/* the slot after the last full pair stands in for the stray byte */
				buf[n/2] = Runeerror;
				n++;
			}
		}
		OUT(out, buf, n/2);
	}
	OUT(out, buf, 0);
}
/*
 * Read little-endian 16-bit code units from fd and hand them to the
 * output converter as runes.
 *
 * Each chunk read is converted in place: byte pair k of the buffer
 * (low byte first) becomes buf[k].  A chunk with an odd byte count
 * is reported when squawk is set and counted in nerrors; the stray
 * byte is then dropped when clean is set, or replaced by Runeerror
 * otherwise.  After read stops returning data, OUT is invoked once
 * more with a count of zero.
 *
 * NOTE(review): the in-place conversion and the 2*N read size look
 * like they assume sizeof(Rune) == 2 -- confirm against the Rune
 * typedef in use.
 */
void
unicode_in_le(int fd, long *notused, struct convert *out)
{
	int k, n, pairs;
	Rune buf[N];
	uchar *src;

	USED(notused);
	for(;;){
		n = read(fd, (char *)buf, 2*N);
		if(n <= 0)
			break;
		ninput += n;

		/* both bytes of a pair are fetched before buf[k] is stored */
		src = (uchar*)buf;
		pairs = n/2;
		for(k = 0; k < pairs; k++)
			buf[k] = src[2*k] | (src[2*k+1]<<8);

		if(n&1){
			if(squawk)
				EPR "%s: odd byte count in %s\n", argv0, file);
			nerrors++;
			if(clean)
				n--;
			else{
				/* the slot after the last full pair stands in for the stray byte */
				buf[n/2] = Runeerror;
				n++;
			}
		}
		OUT(out, buf, n/2);
	}
	OUT(out, buf, 0);
}
void void
unicode_out(Rune *base, int n, long *notused) unicode_out(Rune *base, int n, long *notused)
{ {
@ -244,6 +300,44 @@ unicode_out(Rune *base, int n, long *notused)
write(1, (char *)base, 2*n); write(1, (char *)base, 2*n);
} }
/*
 * Emit n runes from base on file descriptor 1 as big-endian
 * 16-bit code units.
 *
 * The runes are serialised in place over the start of base (high
 * byte, then low byte), so the caller's buffer is overwritten.
 * nrunes and noutput are advanced by n and 2*n respectively.
 * The result of write is not examined.
 *
 * NOTE(review): writing two bytes per rune over the same storage
 * presumes a Rune occupies at least two bytes -- confirm.
 */
void
unicode_out_be(Rune *base, int n, long *notused)
{
	int k;
	uchar *dst;
	Rune v;

	USED(notused);
	dst = (uchar*)base;
	for(k = 0; k < n; k++){
		/* fetch the rune before its storage is reused for output bytes */
		v = base[k];
		dst[2*k] = v>>8;
		dst[2*k+1] = v;
	}
	noutput += 2*n;
	nrunes += n;
	write(1, (char *)base, 2*n);
}
/*
 * Emit n runes from base on file descriptor 1 as little-endian
 * 16-bit code units.
 *
 * The runes are serialised in place over the start of base (low
 * byte, then high byte), so the caller's buffer is overwritten.
 * nrunes and noutput are advanced by n and 2*n respectively.
 * The result of write is not examined.
 *
 * NOTE(review): writing two bytes per rune over the same storage
 * presumes a Rune occupies at least two bytes -- confirm.
 */
void
unicode_out_le(Rune *base, int n, long *notused)
{
	int k;
	uchar *dst;
	Rune v;

	USED(notused);
	dst = (uchar*)base;
	for(k = 0; k < n; k++){
		/* fetch the rune before its storage is reused for output bytes */
		v = base[k];
		dst[2*k] = v;
		dst[2*k+1] = v>>8;
	}
	noutput += 2*n;
	nrunes += n;
	write(1, (char *)base, 2*n);
}
void void
intable(int fd, long *table, struct convert *out) intable(int fd, long *table, struct convert *out)
{ {
@ -270,6 +364,7 @@ intable(int fd, long *table, struct convert *out)
} }
OUT(out, runes, r-runes); OUT(out, runes, r-runes);
} }
OUT(out, runes, 0);
if(n < 0){ if(n < 0){
#ifdef PLAN9 #ifdef PLAN9
EPR "%s: input read: %r\n", argv0); EPR "%s: input read: %r\n", argv0);
@ -403,64 +498,91 @@ struct convert convert[] =
{ "av", "Alternativnyj Variant", Table, (void *)tabav }, { "av", "Alternativnyj Variant", Table, (void *)tabav },
{ "big5", "Big 5 (HKU)", From|Func, 0, (Fnptr)big5_in }, { "big5", "Big 5 (HKU)", From|Func, 0, (Fnptr)big5_in },
{ "big5", "Big 5 (HKU)", Func, 0, (Fnptr)big5_out }, { "big5", "Big 5 (HKU)", Func, 0, (Fnptr)big5_out },
{ "cp437", "Code Page 437 (US)", Table, (void*)tabcp437 },
{ "cp720", "Code Page 720 (Arabic)", Table, (void*)tabcp720 },
{ "cp737", "Code Page 737 (Greek)", Table, (void*)tabcp737 },
{ "cp775", "Code Page 775 (Baltic)", Table, (void*)tabcp775 },
{ "cp850", "Code Page 850 (Multilingual Latin I)", Table, (void*)tabcp850 },
{ "cp852", "Code Page 852 (Latin II)", Table, (void*)tabcp852 },
{ "cp855", "Code Page 855 (Cyrillic)", Table, (void*)tabcp855 },
{ "cp857", "Code Page 857 (Turkish)", Table, (void*)tabcp857 },
{ "cp858", "Code Page 858 (Multilingual Latin I+Euro)", Table, (void*)tabcp858 },
{ "cp862", "Code Page 862 (Hebrew)", Table, (void*)tabcp862 },
{ "cp866", "Code Page 866 (Russian)", Table, (void*)tabcp866 },
{ "cp874", "Code Page 874 (Thai)", Table, (void*)tabcp874 },
{ "cp1250", "Code Page 1250 (Central Europe)", Table, (void *)tabcp1250 },
{ "cp1251", "Code Page 1251 (Cyrillic)", Table, (void *)tabcp1251 },
{ "cp1252", "Code Page 1252 (Latin I)", Table, (void *)tabcp1252 },
{ "cp1253", "Code Page 1253 (Greek)", Table, (void *)tabcp1253 },
{ "cp1254", "Code Page 1254 (Turkish)", Table, (void *)tabcp1254 },
{ "cp1255", "Code Page 1255 (Hebrew)", Table, (void *)tabcp1255 },
{ "cp1256", "Code Page 1256 (Arabic)", Table, (void *)tabcp1256 },
{ "cp1257", "Code Page 1257 (Baltic)", Table, (void *)tabcp1257 },
{ "cp1258", "Code Page 1258 (Vietnam)", Table, (void *)tabcp1258 },
{ "ebcdic", "EBCDIC", Table, (void *)tabebcdic }, /* 6f is recommended bad map */ { "ebcdic", "EBCDIC", Table, (void *)tabebcdic }, /* 6f is recommended bad map */
{ "euc-k", "Korean EUC: ASCII+KS C 5601 1987", From|Func, 0, (Fnptr)uksc_in }, { "euc-k", "Korean EUC: ASCII+KS C 5601 1987", From|Func, 0, (Fnptr)uksc_in },
{ "euc-k", "Korean EUC: ASCII+KS C 5601 1987", Func, 0, (Fnptr)uksc_out }, { "euc-k", "Korean EUC: ASCII+KS C 5601 1987", Func, 0, (Fnptr)uksc_out },
{ "gb", "GB2312-80 (Chinese)", From|Func, 0, (Fnptr)gb_in }, { "gb2312", "GB2312-80 (Chinese)", From|Func, 0, (Fnptr)gb_in },
{ "gb", "GB2312-80 (Chinese)", Func, 0, (Fnptr)gb_out }, { "gb2312", "GB2312-80 (Chinese)", Func, 0, (Fnptr)gb_out },
{ "html", "HTML", From|Func, 0, (Fnptr)html_in }, { "html", "HTML", From|Func, 0, (Fnptr)html_in },
{ "html", "HTML", Func, 0, (Fnptr)html_out }, { "html", "HTML", Func, 0, (Fnptr)html_out },
{ "ibm437", "IBM Code Page 437 (US)", Table, (void*)tabcp437 },
{ "ibm720", "IBM Code Page 720 (Arabic)", Table, (void*)tabcp720 },
{ "ibm737", "IBM Code Page 737 (Greek)", Table, (void*)tabcp737 },
{ "ibm775", "IBM Code Page 775 (Baltic)", Table, (void*)tabcp775 },
{ "ibm850", "IBM Code Page 850 (Multilingual Latin I)", Table, (void*)tabcp850 },
{ "ibm852", "IBM Code Page 852 (Latin II)", Table, (void*)tabcp852 },
{ "ibm855", "IBM Code Page 855 (Cyrillic)", Table, (void*)tabcp855 },
{ "ibm857", "IBM Code Page 857 (Turkish)", Table, (void*)tabcp857 },
{ "ibm858", "IBM Code Page 858 (Multilingual Latin I+Euro)", Table, (void*)tabcp858 },
{ "ibm862", "IBM Code Page 862 (Hebrew)", Table, (void*)tabcp862 },
{ "ibm866", "IBM Code Page 866 (Russian)", Table, (void*)tabcp866 },
{ "ibm874", "IBM Code Page 874 (Thai)", Table, (void*)tabcp874 },
{ "iso-2022-jp", "alias for jis-kanji (MIME)", From|Func, 0, (Fnptr)jisjis_in },
{ "iso-2022-jp", "alias for jis-kanji (MIME)", Func, 0, (Fnptr)jisjis_out },
{ "iso-8859-1", "alias for 8859-1 (MIME)", Table, (void *)tab8859_1 },
{ "iso-8859-2", "alias for 8859-2 (MIME)", Table, (void *)tab8859_2 },
{ "iso-8859-3", "alias for 8859-3 (MIME)", Table, (void *)tab8859_3 },
{ "iso-8859-4", "alias for 8859-4 (MIME)", Table, (void *)tab8859_4 },
{ "iso-8859-5", "alias for 8859-5 (MIME)", Table, (void *)tab8859_5 },
{ "iso-8859-6", "alias for 8859-6 (MIME)", Table, (void *)tab8859_6 },
{ "iso-8859-7", "alias for 8859-7 (MIME)", Table, (void *)tab8859_7 },
{ "iso-8859-8", "alias for 8859-8 (MIME)", Table, (void *)tab8859_8 },
{ "iso-8859-9", "alias for 8859-9 (MIME)", Table, (void *)tab8859_9 },
{ "iso-8859-10", "alias for 8859-10 (MIME)", Table, (void *)tab8859_10 },
{ "iso-8859-15", "alias for 8859-15 (MIME)", Table, (void *)tab8859_15 },
{ "jis", "guesses at the JIS encoding", From|Func, 0, (Fnptr)jis_in }, { "jis", "guesses at the JIS encoding", From|Func, 0, (Fnptr)jis_in },
{ "jis-kanji", "ISO 2022-JP (Japanese)", From|Func, 0, (Fnptr)jisjis_in }, { "jis-kanji", "ISO 2022-JP (Japanese)", From|Func, 0, (Fnptr)jisjis_in },
{ "jis-kanji", "ISO 2022-JP (Japanese)", Func, 0, (Fnptr)jisjis_out }, { "jis-kanji", "ISO 2022-JP (Japanese)", Func, 0, (Fnptr)jisjis_out },
{ "koi8", "KOI-8 (GOST 19769-74)", Table, (void *)tabkoi8 }, { "koi8", "KOI-8 (GOST 19769-74)", Table, (void *)tabkoi8 },
{ "latin1", "ISO 8859-1", Table, (void *)tab8859_1 }, { "koi8-r", "alias for koi8 (MIME)", Table, (void *)tabkoi8 },
{ "latin1", "alias for 8859-1", Table, (void *)tab8859_1 },
{ "macrom", "Macintosh Standard Roman character set", Table, (void *)tabmacroman }, { "macrom", "Macintosh Standard Roman character set", Table, (void *)tabmacroman },
{ "microsoft", "Windows (CP 1252)", Table, (void *)tabcp1252 }, { "microsoft", "alias for windows1252", Table, (void *)tabcp1252 },
{ "msdos", "IBM PC (CP 437)", Table, (void *)tabcp437 },
{ "msdos2", "IBM PC (CP 437 with graphics in C0)", Table, (void *)tabmsdos2 },
{ "ms-kanji", "Microsoft, or Shift-JIS", From|Func, 0, (Fnptr)msjis_in }, { "ms-kanji", "Microsoft, or Shift-JIS", From|Func, 0, (Fnptr)msjis_in },
{ "ms-kanji", "Microsoft, or Shift-JIS", Func, 0, (Fnptr)msjis_out }, { "ms-kanji", "Microsoft, or Shift-JIS", Func, 0, (Fnptr)msjis_out },
{ "msdos", "IBM PC (alias for ibm437)", Table, (void *)tabcp437 },
{ "msdos2", "IBM PC (ibm437 with graphics in C0)", Table, (void *)tabmsdos2 },
{ "next", "NEXTSTEP character set", Table, (void *)tabnextstep }, { "next", "NEXTSTEP character set", Table, (void *)tabnextstep },
{ "ov", "Osnovnoj Variant", Table, (void *)tabov }, { "ov", "Osnovnoj Variant", Table, (void *)tabov },
{ "ps2", "IBM PS/2: (CP 850)", Table, (void *)tabcp850 }, { "ps2", "IBM PS/2: (alias for ibm850)", Table, (void *)tabcp850 },
{ "sf1", "ISO-646: Finnish/Swedish SF-1 variant", Table, (void *)tabsf1 }, { "sf1", "ISO-646: Finnish/Swedish SF-1 variant", Table, (void *)tabsf1 },
{ "sf2", "ISO-646: Finnish/Swedish SF-2 variant (recommended)", Table, (void *)tabsf2 }, { "sf2", "ISO-646: Finnish/Swedish SF-2 variant (recommended)", Table, (void *)tabsf2 },
{ "tis", "Thai+ASCII (TIS 620-1986)", Table, (void *)tabtis620 }, { "tis-620", "Thai+ASCII (TIS 620-1986)", Table, (void *)tabtis620 },
{ "tune", "TUNE (Tamil)", From|Func, 0, (Fnptr)tune_in },
{ "tune", "TUNE (Tamil)", Func, 0, (Fnptr)tune_out },
{ "ucode", "Russian U-code", Table, (void *)tabucode }, { "ucode", "Russian U-code", Table, (void *)tabucode },
{ "ujis", "EUC-JX: JIS 0208", From|Func, 0, (Fnptr)ujis_in }, { "ujis", "EUC-JX: JIS 0208", From|Func, 0, (Fnptr)ujis_in },
{ "ujis", "EUC-JX: JIS 0208", Func, 0, (Fnptr)ujis_out }, { "ujis", "EUC-JX: JIS 0208", Func, 0, (Fnptr)ujis_out },
{ "unicode", "Unicode 1.1", From|Func, 0, (Fnptr)unicode_in }, { "unicode", "Unicode 1.1", From|Func, 0, (Fnptr)unicode_in },
{ "unicode", "Unicode 1.1", Func, 0, (Fnptr)unicode_out }, { "unicode", "Unicode 1.1", Func, 0, (Fnptr)unicode_out },
{ "utf1", "UTF-1 (ISO 10646 Annex A)", From|Func, 0, (Fnptr)isoutf_in }, { "unicode-be", "Unicode 1.1 big-endian", From|Func, 0, (Fnptr)unicode_in_be },
{ "utf1", "UTF-1 (ISO 10646 Annex A)", Func, 0, (Fnptr)isoutf_out }, { "unicode-be", "Unicode 1.1 big-endian", Func, 0, (Fnptr)unicode_out_be },
{ "unicode-le", "Unicode 1.1 little-endian", From|Func, 0, (Fnptr)unicode_in_le },
{ "unicode-le", "Unicode 1.1 little-endian", Func, 0, (Fnptr)unicode_out_le },
{ "us-ascii", "alias for ascii (MIME)", Table, (void *)tabascii },
{ "utf", "FSS-UTF a.k.a. UTF-8", From|Func, 0, (Fnptr)utf_in }, { "utf", "FSS-UTF a.k.a. UTF-8", From|Func, 0, (Fnptr)utf_in },
{ "utf", "FSS-UTF a.k.a. UTF-8", Func, 0, (Fnptr)utf_out }, { "utf", "FSS-UTF a.k.a. UTF-8", Func, 0, (Fnptr)utf_out },
{ "utf-l2", "from", From|Func, 0, (Fnptr)utf_in }, { "utf1", "UTF-1 (ISO 10646 Annex A)", From|Func, 0, (Fnptr)isoutf_in },
{ "utf-l2", "to", Func, 0, (Fnptr)utf_out }, { "utf1", "UTF-1 (ISO 10646 Annex A)", Func, 0, (Fnptr)isoutf_out },
{ "utf-8", "alias for utf (MIME)", From|Func, 0, (Fnptr)utf_in },
{ "utf-8", "alias for utf (MIME)", Func, 0, (Fnptr)utf_out },
{ "utf-16", "alias for unicode (MIME)", From|Func, 0, (Fnptr)unicode_in },
{ "utf-16", "alias for unicode (MIME)", Func, 0, (Fnptr)unicode_out },
{ "utf-16be", "alias for unicode-be (MIME)", From|Func, 0, (Fnptr)unicode_in_be },
{ "utf-16be", "alias for unicode-be (MIME)", Func, 0, (Fnptr)unicode_out_be },
{ "utf-16le", "alias for unicode-le (MIME)", From|Func, 0, (Fnptr)unicode_in_le },
{ "utf-16le", "alias for unicode-le (MIME)", Func, 0, (Fnptr)unicode_out_le },
{ "viet1", "Vietnamese VSCII-1 (1993)", Table, (void *)tabviet1 }, { "viet1", "Vietnamese VSCII-1 (1993)", Table, (void *)tabviet1 },
{ "viet2", "Vietnamese VSCII-2 (1993)", Table, (void *)tabviet2 }, { "viet2", "Vietnamese VSCII-2 (1993)", Table, (void *)tabviet2 },
{ "viscii", "Vietnamese VISCII 1.1 (1992)", Table, (void *)tabviscii }, { "vscii", "Vietnamese VISCII 1.1 (1992)", Table, (void *)tabviscii },
{ "windows-1250", "Windows Code Page 1250 (Central Europe)", Table, (void *)tabcp1250 },
{ "windows-1251", "Windows Code Page 1251 (Cyrillic)", Table, (void *)tabcp1251 },
{ "windows-1252", "Windows Code Page 1252 (Latin I)", Table, (void *)tabcp1252 },
{ "windows-1253", "Windows Code Page 1253 (Greek)", Table, (void *)tabcp1253 },
{ "windows-1254", "Windows Code Page 1254 (Turkish)", Table, (void *)tabcp1254 },
{ "windows-1255", "Windows Code Page 1255 (Hebrew)", Table, (void *)tabcp1255 },
{ "windows-1256", "Windows Code Page 1256 (Arabic)", Table, (void *)tabcp1256 },
{ "windows-1257", "Windows Code Page 1257 (Baltic)", Table, (void *)tabcp1257 },
{ "windows-1258", "Windows Code Page 1258 (Vietnam)", Table, (void *)tabcp1258 },
{ 0 } { 0 }
}; };

View file

@ -45,15 +45,15 @@ utf_in(int fd, long *notused, struct convert *out)
tot = 0; tot = 0;
while((n = read(fd, buf+tot, N-tot)) >= 0){ while((n = read(fd, buf+tot, N-tot)) >= 0){
tot += n; tot += n;
for(i=j=0; i<tot; ){ for(i=j=0; i<tot-UTFmax || (n==0 && i<tot); ){
c = our_mbtowc(&l, buf+i, tot-i); c = our_mbtowc(&l, buf+i, tot-i);
if(c == -2)
break;
if(c == -1){ if(c == -1){
if(squawk) if(squawk)
EPR "%s: bad UTF sequence near byte %ld in input\n", argv0, ninput+i); EPR "%s: bad UTF sequence near byte %ld in input\n", argv0, ninput+i);
if(clean) if(clean){
i++;
continue; continue;
}
nerrors++; nerrors++;
l = Runeerror; l = Runeerror;
c = 1; c = 1;
@ -69,6 +69,7 @@ utf_in(int fd, long *notused, struct convert *out)
if(n == 0) if(n == 0)
break; break;
} }
OUT(out, runes, 0);
} }
void void
@ -100,11 +101,13 @@ isoutf_in(int fd, long *notused, struct convert *out)
if(!fullisorune(buf+i, tot-i)) if(!fullisorune(buf+i, tot-i))
break; break;
c = isochartorune(&runes[j], buf+i); c = isochartorune(&runes[j], buf+i);
if(runes[j] == Runeerror){ if(runes[j] == Runeerror && c == 1){
if(squawk) if(squawk)
EPR "%s: bad UTF sequence near byte %ld in input\n", argv0, ninput+i); EPR "%s: bad UTF sequence near byte %ld in input\n", argv0, ninput+i);
if(clean) if(clean){
i++;
continue; continue;
}
nerrors++; nerrors++;
} }
j++; j++;
@ -118,6 +121,7 @@ isoutf_in(int fd, long *notused, struct convert *out)
if(n == 0) if(n == 0)
break; break;
} }
OUT(out, runes, 0);
} }
void void
@ -393,19 +397,19 @@ our_mbtowc(unsigned long *p, char *s, unsigned n)
return 0; /* no shift states */ return 0; /* no shift states */
if(n < 1) if(n < 1)
goto badlen; goto bad;
us = (uchar*)s; us = (uchar*)s;
c0 = us[0]; c0 = us[0];
if(c0 >= T3) { if(c0 >= T3) {
if(n < 3) if(n < 3)
goto badlen; goto bad;
c1 = us[1] ^ Tx; c1 = us[1] ^ Tx;
c2 = us[2] ^ Tx; c2 = us[2] ^ Tx;
if((c1|c2) & T2) if((c1|c2) & T2)
goto bad; goto bad;
if(c0 >= T5) { if(c0 >= T5) {
if(n < 5) if(n < 5)
goto badlen; goto bad;
c3 = us[3] ^ Tx; c3 = us[3] ^ Tx;
c4 = us[4] ^ Tx; c4 = us[4] ^ Tx;
if((c3|c4) & T2) if((c3|c4) & T2)
@ -413,7 +417,7 @@ our_mbtowc(unsigned long *p, char *s, unsigned n)
if(c0 >= T6) { if(c0 >= T6) {
/* 6 bytes */ /* 6 bytes */
if(n < 6) if(n < 6)
goto badlen; goto bad;
c5 = us[5] ^ Tx; c5 = us[5] ^ Tx;
if(c5 & T2) if(c5 & T2)
goto bad; goto bad;
@ -437,7 +441,7 @@ our_mbtowc(unsigned long *p, char *s, unsigned n)
if(c0 >= T4) { if(c0 >= T4) {
/* 4 bytes */ /* 4 bytes */
if(n < 4) if(n < 4)
goto badlen; goto bad;
c3 = us[3] ^ Tx; c3 = us[3] ^ Tx;
if(c3 & T2) if(c3 & T2)
goto bad; goto bad;
@ -460,7 +464,7 @@ our_mbtowc(unsigned long *p, char *s, unsigned n)
if(c0 >= T2) { if(c0 >= T2) {
/* 2 bytes */ /* 2 bytes */
if(n < 2) if(n < 2)
goto badlen; goto bad;
c1 = us[1] ^ Tx; c1 = us[1] ^ Tx;
if(c1 & T2) if(c1 & T2)
goto bad; goto bad;
@ -480,6 +484,4 @@ our_mbtowc(unsigned long *p, char *s, unsigned n)
bad: bad:
errno = EILSEQ; errno = EILSEQ;
return -1; return -1;
badlen:
return -2;
} }