--- html.c.old 2009-10-05 12:38:12.000000000 +0900
+++ html.c.new 2009-10-09 10:21:06.000000000 +0900
@@ -519,95 +519,183 @@
case cs_utf_8:
{
unsigned long utf = 0;
- int stat = 0;
- int more = 1;
- /* unpack utf-8 encoding into a wide char.
- * Code stolen from the mbstring extension */
+ if (this_char >= 0xc2 && this_char <= 0xdf) {
+ utf = (this_char & 0x1f) << 6;
+ CHECK_LEN(pos, 1);
- do {
- if (this_char < 0x80) {
- more = 0;
- if(stat) {
- /* we didn't finish the UTF sequence correctly */
- --pos;
- *status = FAILURE;
- }
- break;
- } else if (this_char < 0xc0) {
- switch (stat) {
- case 0x10: /* 2, 2nd */
- case 0x21: /* 3, 3rd */
- case 0x32: /* 4, 4th */
- case 0x43: /* 5, 5th */
- case 0x54: /* 6, 6th */
- /* last byte in sequence */
- more = 0;
- utf |= (this_char & 0x3f);
- this_char = (unsigned short)utf;
- break;
- case 0x20: /* 3, 2nd */
- case 0x31: /* 4, 3rd */
- case 0x42: /* 5, 4th */
- case 0x53: /* 6, 5th */
- /* penultimate char */
- utf |= ((this_char & 0x3f) << 6);
- stat++;
- break;
- case 0x30: /* 4, 2nd */
- case 0x41: /* 5, 3rd */
- case 0x52: /* 6, 4th */
- utf |= ((this_char & 0x3f) << 12);
- stat++;
- break;
- case 0x40: /* 5, 2nd */
- case 0x51:
- utf |= ((this_char & 0x3f) << 18);
- stat++;
- break;
- case 0x50: /* 6, 2nd */
- utf |= ((this_char & 0x3f) << 24);
- stat++;
- break;
- default:
- /* invalid */
- *status = FAILURE;
- more = 0;
- }
+ this_char = str[pos++];
+ MB_WRITE((unsigned char)this_char);
+ if (this_char >= 0x80 && this_char <= 0xbf) {
+ utf |= (this_char & 0x3f);
+ this_char = (unsigned short)utf;
+ } else {
+ *status = FAILURE;
}
- /* lead byte */
- else if (this_char < 0xe0) {
- stat = 0x10; /* 2 byte */
- utf = (this_char & 0x1f) << 6;
- CHECK_LEN(pos, 1);
- } else if (this_char < 0xf0) {
- stat = 0x20; /* 3 byte */
- utf = (this_char & 0xf) << 12;
- CHECK_LEN(pos, 2);
- } else if (this_char < 0xf8) {
- stat = 0x30; /* 4 byte */
- utf = (this_char & 0x7) << 18;
- CHECK_LEN(pos, 3);
- } else if (this_char < 0xfc) {
- stat = 0x40; /* 5 byte */
- utf = (this_char & 0x3) << 24;
- CHECK_LEN(pos, 4);
- } else if (this_char < 0xfe) {
- stat = 0x50; /* 6 byte */
- utf = (this_char & 0x1) << 30;
- CHECK_LEN(pos, 5);
+
+ } else if (this_char == 0xe0) {
+ utf = (this_char & 0xf) << 12;
+ CHECK_LEN(pos, 2);
+
+ this_char = str[pos++];
+ MB_WRITE((unsigned char)this_char);
+ if (this_char >= 0xa0 && this_char <= 0xbf) {
+ utf |= ((this_char & 0x3f) << 6);
+ this_char = (unsigned short)utf;
+ } else {
+ *status = FAILURE;
+ }
+
+ this_char = str[pos++];
+ MB_WRITE((unsigned char)this_char);
+ if (this_char >= 0x80 && this_char <= 0xbf) {
+ utf |= (this_char & 0x3f);
+ this_char = (unsigned short)utf;
+ } else {
+ *status = FAILURE;
+ }
+
+ } else if ((this_char >= 0xe1 && this_char <= 0xec) ||
+ (this_char >= 0xee && this_char <= 0xef)) {
+ utf = (this_char & 0xf) << 12;
+ CHECK_LEN(pos, 2);
+
+ this_char = str[pos++];
+ MB_WRITE((unsigned char)this_char);
+ if (this_char >= 0x80 && this_char <= 0xbf) {
+ utf |= ((this_char & 0x3f) << 6);
+ this_char = (unsigned short)utf;
+ } else {
+ *status = FAILURE;
+ }
+
+ this_char = str[pos++];
+ MB_WRITE((unsigned char)this_char);
+ if (this_char >= 0x80 && this_char <= 0xbf) {
+ utf |= (this_char & 0x3f);
+ this_char = (unsigned short)utf;
+ } else {
+ *status = FAILURE;
+ }
+
+ } else if (this_char == 0xed) {
+ utf = (this_char & 0xf) << 12;
+ CHECK_LEN(pos, 2);
+
+ this_char = str[pos++]; /* second byte of a 3-byte sequence led by 0xED */
+ MB_WRITE((unsigned char)this_char);
+ if (this_char >= 0x80 && this_char <= 0x9f) { /* 0xa0..0xbf after 0xED would encode surrogates U+D800..U+DFFF */
+ utf |= ((this_char & 0x3f) << 6);
+ this_char = (unsigned short)utf;
+ } else {
+ *status = FAILURE; /* not a valid continuation for lead byte 0xED */
+ }
+
+ this_char = str[pos++];
+ MB_WRITE((unsigned char)this_char);
+ if (this_char >= 0x80 && this_char <= 0xbf) {
+ utf |= (this_char & 0x3f);
+ this_char = (unsigned short)utf;
+ } else {
+ *status = FAILURE;
+ }
+
+ } else if (this_char == 0xf0) {
+ utf = (this_char & 0x7) << 18;
+ CHECK_LEN(pos, 3);
+
+ this_char = str[pos++];
+ MB_WRITE((unsigned char)this_char);
+ if (this_char >= 0x90 && this_char <= 0xbf) {
+ utf |= ((this_char & 0x3f) << 12);
+ this_char = (unsigned short)utf;
} else {
- /* invalid; bail */
- more = 0;
*status = FAILURE;
- break;
}
- if (more) {
- this_char = str[pos++];
- MB_WRITE((unsigned char)this_char);
+ this_char = str[pos++];
+ MB_WRITE((unsigned char)this_char);
+ if (this_char >= 0x80 && this_char <= 0xbf) {
+ utf |= ((this_char & 0x3f) << 6);
+ this_char = (unsigned short)utf;
+ } else {
+ *status = FAILURE;
+ }
+
+ this_char = str[pos++];
+ MB_WRITE((unsigned char)this_char);
+ if (this_char >= 0x80 && this_char <= 0xbf) {
+ utf |= (this_char & 0x3f);
+ this_char = (unsigned short)utf;
+ } else {
+ *status = FAILURE;
+ }
+
+ } else if (this_char >= 0xf1 && this_char <= 0xf3) {
+ utf = (this_char & 0x7) << 18;
+ CHECK_LEN(pos, 3);
+
+ this_char = str[pos++];
+ MB_WRITE((unsigned char)this_char);
+ if (this_char >= 0x80 && this_char <= 0xbf) {
+ utf |= ((this_char & 0x3f) << 12);
+ this_char = (unsigned short)utf;
+ } else {
+ *status = FAILURE;
+ }
+
+ this_char = str[pos++];
+ MB_WRITE((unsigned char)this_char);
+ if (this_char >= 0x80 && this_char <= 0xbf) {
+ utf |= ((this_char & 0x3f) << 6);
+ this_char = (unsigned short)utf;
+ } else {
+ *status = FAILURE;
+ }
+
+ this_char = str[pos++];
+ MB_WRITE((unsigned char)this_char);
+ if (this_char >= 0x80 && this_char <= 0xbf) {
+ utf |= (this_char & 0x3f);
+ this_char = (unsigned short)utf;
+ } else {
+ *status = FAILURE;
+ }
+
+ } else if (this_char == 0xf4) {
+ utf = (this_char & 0x7) << 18;
+ CHECK_LEN(pos, 3);
+
+ this_char = str[pos++];
+ MB_WRITE((unsigned char)this_char);
+ if (this_char >= 0x80 && this_char <= 0x8f) {
+ utf |= ((this_char & 0x3f) << 12);
+ this_char = (unsigned short)utf;
+ } else {
+ *status = FAILURE;
}
- } while (more);
+
+ this_char = str[pos++];
+ MB_WRITE((unsigned char)this_char);
+ if (this_char >= 0x80 && this_char <= 0xbf) {
+ utf |= ((this_char & 0x3f) << 6);
+ this_char = (unsigned short)utf;
+ } else {
+ *status = FAILURE;
+ }
+
+ this_char = str[pos++];
+ MB_WRITE((unsigned char)this_char);
+ if (this_char >= 0x80 && this_char <= 0xbf) {
+ utf |= (this_char & 0x3f);
+ this_char = (unsigned short)utf;
+ } else {
+ *status = FAILURE;
+ }
+
+ } else if (!(this_char >= 0x00 && this_char <= 0x7f)) {
+ *status = FAILURE;
+ }
}
break;
case cs_big5:
@@ -648,8 +736,13 @@
MB_WRITE(next_char);
this_char |= next_char;
pos++;
+ } else {
+ *status = FAILURE;
}
-
+ } else if (!(this_char >= 0x00 && this_char <= 0x7f) &&
+ !(this_char >= 0xa1 && this_char <= 0xdf))
+ {
+ *status = FAILURE;
}
break;
}
@@ -666,8 +759,9 @@
MB_WRITE(next_char);
this_char |= next_char;
pos++;
+ } else {
+ *status = FAILURE;
}
-
} else if (this_char == 0x8e) {
/* peek at the next char */
CHECK_LEN(pos, 1);
@@ -678,8 +772,9 @@
MB_WRITE(next_char);
this_char |= next_char;
pos++;
+ } else {
+ *status = FAILURE;
}
-
} else if (this_char == 0x8f) {
/* peek at the next two char */
unsigned char next2_char;
@@ -697,8 +792,11 @@
MB_WRITE(next2_char);
this_char |= next2_char;
pos++;
+ } else {
+ *status = FAILURE;
}
-
+ } else if (!(this_char >= 0x00 && this_char <= 0x7f)) {
+ *status = FAILURE;
}
break;
}