118 | 118 | // If the scanner hasn't found an explicitly defined encoding, check for either EFBBBF, FEFF or FFFE and, if found, set the encoding to UTF-8 or UTF-16 |
119 | 119 | if (!foundExplicitEncoding && [textData length] > 2) { |
120 | 120 | NSString *lookForEncodingInBytesString = [NSString stringWithString:[textData description]]; |
121 | 121 | if ([[lookForEncodingInBytesString substringWithRange:NSMakeRange(1,6)] isEqualToString:@"efbbbf"]) encoding = NSUTF8StringEncoding; |
122 | 122 | else if ([[lookForEncodingInBytesString substringWithRange:NSMakeRange(1,4)] isEqualToString:@"feff"] || [[lookForEncodingInBytesString substringWithRange:NSMakeRange(1,4)] isEqualToString:@"fffe"]) encoding = NSUnicodeStringEncoding; |
123 | 123 | } |
124 | | |
| 124 | |
| 125 | unsigned int pos,utf8,jp2022,euc,sjis,utf16,r; |
| 126 | pos = utf8 = euc = sjis = jp2022 = r = 0; |
| 127 | |
| 128 | if (encoding == 0 && [textData length] > 2) |
| 129 | { |
| 130 | unsigned int nLen = [textData length]; |
| 131 | unsigned char* charData = (unsigned char*)[textData bytes]; |
| 132 | while(pos < nLen) |
| 133 | { |
| 134 | if ((r=[self isUTF8:charData+pos nLen:nLen-pos])>0) |
| 135 | { |
| 136 | utf8++; |
| 137 | pos+=r; |
| 138 | } |
| 139 | else if ([self isEUC:charData+pos nLen:nLen-pos]) |
| 140 | { |
| 141 | euc++; |
| 142 | pos+=2; |
| 143 | } |
| 144 | else if ([self isSJIS:charData+pos nLen:nLen-pos]) |
| 145 | { |
| 146 | sjis++; |
| 147 | pos+=2; |
| 148 | } |
| 149 | else if ((r=[self is2022JP:charData+pos nLen:nLen-pos])>0) |
| 150 | { |
| 151 | jp2022++; |
| 152 | pos+=r; |
| 153 | } |
| 154 | else |
| 155 | { |
| 156 | pos++; |
| 157 | } |
| 158 | } |
| 159 | } |
| 160 | |
| 161 | if (utf8 > 0 && utf8 > euc && utf8 > sjis && utf8 > jp2022) |
| 162 | { |
| 163 | encoding = NSUTF8StringEncoding; |
| 164 | } |
| 165 | else if (euc > 0 && euc > utf8 && euc > sjis && euc > jp2022) |
| 166 | { |
| 167 | encoding = NSJapaneseEUCStringEncoding; |
| 168 | } |
| 169 | else if (sjis > 0 && sjis > utf8 && sjis > euc && sjis > jp2022) |
| 170 | { |
| 171 | encoding = NSShiftJISStringEncoding; |
| 172 | } |
| 173 | else if (jp2022 > 0 && jp2022 > utf8 && jp2022 > euc && jp2022 > sjis) |
| 174 | { |
| 175 | encoding = NSISO2022JPStringEncoding; |
| 176 | } |
| 177 | |
| 178 | NSString* try_str = [[[NSString alloc] initWithData:textData encoding:encoding] autorelease]; |
| 179 | if (try_str != nil) return encoding; |
| 180 | |
| 181 | NSStringEncoding encodings[] = |
| 182 | { |
| 183 | -2147483647,// MacOS Japanese |
| 184 | NSUnicodeStringEncoding, |
| 185 | 0 |
| 186 | }; |
| 187 | |
| 188 | // Try each fallback encoding in order until one decodes the data |
| 189 | int i = 0; |
| 190 | while(encodings[i] != 0) |
| 191 | { |
| 192 | try_str = [[[NSString alloc] initWithData:textData encoding:encodings[i]] autorelease]; |
| 193 | if (try_str != nil) |
| 194 | { |
| 195 | encoding = encodings[i]; |
| 196 | break; |
| 197 | } |
| 198 | i++; |
| 199 | } |
/**
 * Heuristic test of whether the buffer could start with a UTF-16 code unit.
 * Marked NOT USED by the original author.
 *
 * Only the first byte is inspected. The two ranges tested (0xD8-0xDF, and
 * everything at or below 0xD7 or at or above 0xE0) together cover every
 * possible byte value, so the result is YES for any buffer that holds at
 * least two bytes.
 *
 * @param charData Pointer to the bytes to inspect.
 * @param nLen     Number of bytes available at charData.
 * @return NO when fewer than two bytes are available, otherwise YES.
 */
-(BOOL)isUTF16:(unsigned char*)charData nLen:(unsigned int)nLen
{
    if (nLen < 2)
    {
        return NO;
    }

    const unsigned char firstByte = charData[0];

    // 0xD8-0xDB and 0xDC-0xDF: the range this method treats as "surrogate".
    const BOOL inSurrogateRange = (firstByte >= 0xd8 && firstByte <= 0xdf);
    // Every remaining byte value.
    const BOOL outsideSurrogateRange = (firstByte <= 0xd7 || firstByte >= 0xe0);

    return (inSurrogateRange || outsideSurrogateRange) ? YES : NO;
}
| 240 | |
/**
 * Looks for an ISO-2022-JP escape sequence at the start of the buffer.
 *
 * Two sequences are recognised: ESC $ @ (78JIS) and ESC $ B (83JIS).
 *
 * @param charData Pointer to the bytes to inspect.
 * @param nLen     Number of bytes available at charData.
 * @return 3 (the length of the escape sequence) on a match, otherwise 0.
 */
-(unsigned int)is2022JP:(unsigned char*)charData nLen:(unsigned int)nLen
{
    static const unsigned int kEscapeLength = 3;

    if (nLen < kEscapeLength)
    {
        return 0;
    }

    // Both supported sequences begin with ESC (0x1B) followed by '$' (0x24).
    if (charData[0] != 0x1b || charData[1] != 0x24)
    {
        return 0;
    }

    switch (charData[2])
    {
        case 0x40: // ESC $ @ (78JIS)
        case 0x42: // ESC $ B (83JIS)
            return kEscapeLength;
        default:
            return 0;
    }
}
| 250 | |
/**
 * Tests whether the buffer starts with a multi-byte UTF-8 sequence.
 *
 * A sequence is accepted when the lead byte announces a length of 2, 3 or 4
 * bytes and every following byte of that length is a continuation byte
 * (10xxxxxx). Single-byte (ASCII) input returns 0 because it matches none of
 * the multi-byte lead patterns.
 *
 * 4-byte sequences (lead 0xF0-0xF4, i.e. code points above U+FFFF such as
 * emoji) are recognised so that their bytes are not left for the other
 * detectors to misinterpret.
 *
 * @param charData Pointer to the bytes to inspect; NULL yields 0.
 * @param nLen     Number of bytes available at charData.
 * @return The length in bytes (2, 3 or 4) of the sequence found, or 0 when
 *         the buffer does not start with one.
 */
-(unsigned int)isUTF8:(const unsigned char *)charData nLen:(unsigned int)nLen
{
    if (charData == NULL || nLen < 2)
    {
        return 0;
    }

    const unsigned char lead = charData[0];
    unsigned int sequenceLength = 0;

    if ((lead & 0xe0) == 0xc0)
    {
        sequenceLength = 2; // 110xxxxx
    }
    else if ((lead & 0xf0) == 0xe0)
    {
        sequenceLength = 3; // 1110xxxx
    }
    else if (lead >= 0xf0 && lead <= 0xf4)
    {
        sequenceLength = 4; // 11110xxx, limited to leads that stay within U+10FFFF
    }
    else
    {
        return 0;
    }

    // Not enough bytes left to hold the whole sequence.
    if (nLen < sequenceLength)
    {
        return 0;
    }

    unsigned int i;
    for (i = 1; i < sequenceLength; i++)
    {
        if ((charData[i] & 0xc0) != 0x80)
        {
            return 0;
        }
    }

    return sequenceLength;
}
| 268 | |
/**
 * Tests whether the buffer starts with a two-byte Shift_JIS character.
 *
 * Lead byte:  0x81-0x9F or 0xE0-0xFC.
 * Trail byte: 0x40-0xFC, excluding 0x7F.
 *
 * The trail-byte upper bound is inclusive: 0xFC is a valid Shift_JIS trail
 * byte, matching the inclusive bound already used for the lead byte.
 *
 * @param charData Pointer to the bytes to inspect; NULL yields NO.
 * @param nLen     Number of bytes available at charData.
 * @return YES when the first two bytes form a Shift_JIS double-byte
 *         character, otherwise NO.
 */
-(BOOL)isSJIS:(unsigned char*)charData nLen:(unsigned int)nLen
{
    if (charData == NULL || nLen < 2)
    {
        return NO;
    }

    const unsigned char lead = charData[0];
    const unsigned char trail = charData[1];

    const BOOL leadIsValid = (lead >= 0x81 && lead <= 0x9f) ||
                             (lead >= 0xe0 && lead <= 0xfc);
    const BOOL trailIsValid = (trail >= 0x40 && trail <= 0xfc && trail != 0x7f);

    return (leadIsValid && trailIsValid) ? YES : NO;
}
| 281 | |
/**
 * Tests whether the buffer starts with a two-byte EUC-JP character.
 *
 * Accepted forms:
 *   - 0x8E followed by 0xA1-0xDF (half-width katakana);
 *   - 0xA1-0xFE followed by 0xA1-0xFE (JIS X 0208).
 *
 * Bytes 0x90-0x9F are not EUC-JP lead bytes and are rejected. Accepting them
 * would claim valid Shift_JIS double-byte characters as EUC. The three-byte
 * 0x8F form is still not recognised, because this method only reports
 * two-byte matches.
 *
 * @param charData Pointer to the bytes to inspect; NULL yields NO.
 * @param nLen     Number of bytes available at charData.
 * @return YES when the first two bytes form a two-byte EUC-JP character,
 *         otherwise NO.
 */
-(BOOL)isEUC:(unsigned char*)charData nLen:(unsigned int)nLen
{
    if (charData == NULL || nLen < 2)
    {
        return NO;
    }

    const unsigned char lead = charData[0];
    const unsigned char trail = charData[1];

    if (lead == 0x8e)
    {
        // Single-shift 2: the next byte is a half-width katakana.
        return (trail >= 0xa1 && trail <= 0xdf) ? YES : NO;
    }

    if (lead >= 0xa1 && lead <= 0xfe)
    {
        return (trail >= 0xa1 && trail <= 0xfe) ? YES : NO;
    }

    return NO;
}
| 294 | |