Rewrite the cs_utf_8 branch as explicit per-lead-byte range checks instead of
the stat/more state machine, and set *status = FAILURE for bytes that the
later multibyte branches do not accept.

Review fix: in the 0xed branch the second-byte test was
(this_char >= 0xa0 && this_char <= 0x9f), a range that can never match, so
every sequence starting with 0xed was reported as FAILURE. It now tests
0x80..0x9f.

--- html.c.old	2009-10-05 12:38:12.000000000 +0900
+++ html.c.new	2009-10-09 10:21:06.000000000 +0900
@@ -519,95 +519,183 @@
 		case cs_utf_8:
 			{
 				unsigned long utf = 0;
-				int stat = 0;
-				int more = 1;
 
-				/* unpack utf-8 encoding into a wide char.
-				 * Code stolen from the mbstring extension */
+				if (this_char >= 0xc2 && this_char <= 0xdf) { /* lead byte of a 2-byte sequence */
+					utf = (this_char & 0x1f) << 6;
+					CHECK_LEN(pos, 1);
 
-				do {
-					if (this_char < 0x80) {
-						more = 0;
-						if(stat) {
-							/* we didn't finish the UTF sequence correctly */
-							--pos;
-							*status = FAILURE;
-						}
-						break;
-					} else if (this_char < 0xc0) {
-						switch (stat) {
-							case 0x10: /* 2, 2nd */
-							case 0x21: /* 3, 3rd */
-							case 0x32: /* 4, 4th */
-							case 0x43: /* 5, 5th */
-							case 0x54: /* 6, 6th */
-								/* last byte in sequence */
-								more = 0;
-								utf |= (this_char & 0x3f);
-								this_char = (unsigned short)utf;
-								break;
-							case 0x20: /* 3, 2nd */
-							case 0x31: /* 4, 3rd */
-							case 0x42: /* 5, 4th */
-							case 0x53: /* 6, 5th */
-								/* penultimate char */
-								utf |= ((this_char & 0x3f) << 6);
-								stat++;
-								break;
-							case 0x30: /* 4, 2nd */
-							case 0x41: /* 5, 3rd */
-							case 0x52: /* 6, 4th */
-								utf |= ((this_char & 0x3f) << 12);
-								stat++;
-								break;
-							case 0x40: /* 5, 2nd */
-							case 0x51:
-								utf |= ((this_char & 0x3f) << 18);
-								stat++;
-								break;
-							case 0x50: /* 6, 2nd */
-								utf |= ((this_char & 0x3f) << 24);
-								stat++;
-								break;
-							default:
-								/* invalid */
-								*status = FAILURE;
-								more = 0;
-						}
+					this_char = str[pos++];
+					MB_WRITE((unsigned char)this_char);
+					if (this_char >= 0x80 && this_char <= 0xbf) {
+						utf |= (this_char & 0x3f);
+						this_char = (unsigned short)utf;
+					} else {
+						*status = FAILURE;
 					}
-					/* lead byte */
-					else if (this_char < 0xe0) {
-						stat = 0x10; /* 2 byte */
-						utf = (this_char & 0x1f) << 6;
-						CHECK_LEN(pos, 1);
-					} else if (this_char < 0xf0) {
-						stat = 0x20; /* 3 byte */
-						utf = (this_char & 0xf) << 12;
-						CHECK_LEN(pos, 2);
-					} else if (this_char < 0xf8) {
-						stat = 0x30; /* 4 byte */
-						utf = (this_char & 0x7) << 18;
-						CHECK_LEN(pos, 3);
-					} else if (this_char < 0xfc) {
-						stat = 0x40; /* 5 byte */
-						utf = (this_char & 0x3) << 24;
-						CHECK_LEN(pos, 4);
-					} else if (this_char < 0xfe) {
-						stat = 0x50; /* 6 byte */
-						utf = (this_char & 0x1) << 30;
-						CHECK_LEN(pos, 5);
+
+				} else if (this_char == 0xe0) { /* 3-byte sequence; second byte limited to 0xa0..0xbf */
+					utf = (this_char & 0xf) << 12;
+					CHECK_LEN(pos, 2);
+
+					this_char = str[pos++];
+					MB_WRITE((unsigned char)this_char);
+					if (this_char >= 0xa0 && this_char <= 0xbf) {
+						utf |= ((this_char & 0x3f) << 6);
+						this_char = (unsigned short)utf;
+					} else {
+						*status = FAILURE;
+					}
+
+					this_char = str[pos++];
+					MB_WRITE((unsigned char)this_char);
+					if (this_char >= 0x80 && this_char <= 0xbf) {
+						utf |= (this_char & 0x3f);
+						this_char = (unsigned short)utf;
+					} else {
+						*status = FAILURE;
+					}
+
+				} else if ((this_char >= 0xe1 && this_char <= 0xec) ||
+						   (this_char >= 0xee && this_char <= 0xef)) { /* 3-byte sequence; both trail bytes 0x80..0xbf */
+					utf = (this_char & 0xf) << 12;
+					CHECK_LEN(pos, 2);
+
+					this_char = str[pos++];
+					MB_WRITE((unsigned char)this_char);
+					if (this_char >= 0x80 && this_char <= 0xbf) {
+						utf |= ((this_char & 0x3f) << 6);
+						this_char = (unsigned short)utf;
+					} else {
+						*status = FAILURE;
+					}
+
+					this_char = str[pos++];
+					MB_WRITE((unsigned char)this_char);
+					if (this_char >= 0x80 && this_char <= 0xbf) {
+						utf |= (this_char & 0x3f);
+						this_char = (unsigned short)utf;
+					} else {
+						*status = FAILURE;
+					}
+
+				} else if (this_char == 0xed) { /* 3-byte sequence; second byte limited to 0x80..0x9f */
+					utf = (this_char & 0xf) << 12;
+					CHECK_LEN(pos, 2);
+
+					this_char = str[pos++];
+					MB_WRITE((unsigned char)this_char);
+					if (this_char >= 0x80 && this_char <= 0x9f) { /* lower bound must be 0x80; 0xa0 made the range empty */
+						utf |= ((this_char & 0x3f) << 6);
+						this_char = (unsigned short)utf;
+					} else {
+						*status = FAILURE;
+					}
+
+					this_char = str[pos++];
+					MB_WRITE((unsigned char)this_char);
+					if (this_char >= 0x80 && this_char <= 0xbf) {
+						utf |= (this_char & 0x3f);
+						this_char = (unsigned short)utf;
+					} else {
+						*status = FAILURE;
+					}
+
+				} else if (this_char == 0xf0) { /* 4-byte sequence; second byte limited to 0x90..0xbf */
+					utf = (this_char & 0x7) << 18;
+					CHECK_LEN(pos, 3);
+
+					this_char = str[pos++];
+					MB_WRITE((unsigned char)this_char);
+					if (this_char >= 0x90 && this_char <= 0xbf) {
+						utf |= ((this_char & 0x3f) << 12);
+						this_char = (unsigned short)utf;
 					} else {
-						/* invalid; bail */
-						more = 0;
 						*status = FAILURE;
-						break;
 					}
 
-					if (more) {
-						this_char = str[pos++];
-						MB_WRITE((unsigned char)this_char);
+					this_char = str[pos++];
+					MB_WRITE((unsigned char)this_char);
+					if (this_char >= 0x80 && this_char <= 0xbf) {
+						utf |= ((this_char & 0x3f) << 6);
+						this_char = (unsigned short)utf;
+					} else {
+						*status = FAILURE;
+					}
+
+					this_char = str[pos++];
+					MB_WRITE((unsigned char)this_char);
+					if (this_char >= 0x80 && this_char <= 0xbf) {
+						utf |= (this_char & 0x3f);
+						this_char = (unsigned short)utf;
+					} else {
+						*status = FAILURE;
+					}
+
+				} else if (this_char >= 0xf1 && this_char <= 0xf3) { /* 4-byte sequence; all trail bytes 0x80..0xbf */
+					utf = (this_char & 0x7) << 18;
+					CHECK_LEN(pos, 3);
+
+					this_char = str[pos++];
+					MB_WRITE((unsigned char)this_char);
+					if (this_char >= 0x80 && this_char <= 0xbf) {
+						utf |= ((this_char & 0x3f) << 12);
+						this_char = (unsigned short)utf;
+					} else {
+						*status = FAILURE;
+					}
+
+					this_char = str[pos++];
+					MB_WRITE((unsigned char)this_char);
+					if (this_char >= 0x80 && this_char <= 0xbf) {
+						utf |= ((this_char & 0x3f) << 6);
+						this_char = (unsigned short)utf;
+					} else {
+						*status = FAILURE;
+					}
+
+					this_char = str[pos++];
+					MB_WRITE((unsigned char)this_char);
+					if (this_char >= 0x80 && this_char <= 0xbf) {
+						utf |= (this_char & 0x3f);
+						this_char = (unsigned short)utf;
+					} else {
+						*status = FAILURE;
+					}
+
+				} else if (this_char == 0xf4) { /* 4-byte sequence; second byte limited to 0x80..0x8f */
+					utf = (this_char & 0x7) << 18;
+					CHECK_LEN(pos, 3);
+
+					this_char = str[pos++];
+					MB_WRITE((unsigned char)this_char);
+					if (this_char >= 0x80 && this_char <= 0x8f) {
+						utf |= ((this_char & 0x3f) << 12);
+						this_char = (unsigned short)utf;
+					} else {
+						*status = FAILURE;
 					}
-				} while (more);
+
+					this_char = str[pos++];
+					MB_WRITE((unsigned char)this_char);
+					if (this_char >= 0x80 && this_char <= 0xbf) {
+						utf |= ((this_char & 0x3f) << 6);
+						this_char = (unsigned short)utf;
+					} else {
+						*status = FAILURE;
+					}
+
+					this_char = str[pos++];
+					MB_WRITE((unsigned char)this_char);
+					if (this_char >= 0x80 && this_char <= 0xbf) {
+						utf |= (this_char & 0x3f);
+						this_char = (unsigned short)utf;
+					} else {
+						*status = FAILURE;
+					}
+
+				} else if (!(this_char >= 0x00 && this_char <= 0x7f)) { /* any other byte above 0x7f is rejected */
+					*status = FAILURE;
+				}
 			}
 			break;
 		case cs_big5:
@@ -648,8 +736,13 @@
 						MB_WRITE(next_char);
 						this_char |= next_char;
 						pos++;
+					} else {
+						*status = FAILURE;
 					}
-
+				} else if (!(this_char >= 0x00 && this_char <= 0x7f) &&
+						   !(this_char >= 0xa1 && this_char <= 0xdf))
+				{
+					*status = FAILURE;
 				}
 				break;
 			}
@@ -666,8 +759,9 @@
 						MB_WRITE(next_char);
 						this_char |= next_char;
 						pos++;
+					} else {
+						*status = FAILURE;
 					}
-
 				} else if (this_char == 0x8e) {
 					/* peek at the next char */
 					CHECK_LEN(pos, 1);
@@ -678,8 +772,9 @@
 						MB_WRITE(next_char);
 						this_char |= next_char;
 						pos++;
+					} else {
+						*status = FAILURE;
 					}
-
 				} else if (this_char == 0x8f) {
 					/* peek at the next two char */
 					unsigned char next2_char;
@@ -697,8 +792,11 @@
 						MB_WRITE(next2_char);
 						this_char |= next2_char;
 						pos++;
+					} else {
+						*status = FAILURE;
 					}
-
+				} else if (!(this_char >= 0x00 && this_char <= 0x7f)) {
+					*status = FAILURE;
 				}
 				break;
 			}