/* * SPL - The SPL Programming Language * Copyright (C) 2004, 2005 Clifford Wolf * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * * utf8.c: Simple utf8 conversion and check functions */ #include #include #include #include "spl.h" #include "utf8tab.h" typedef unsigned char charset_map_table_type[4]; static struct { charset_map_table_type *table; char *name; } charset_map[] = { { UTF8TAB_ISO8859_1, "ascii" }, { UTF8TAB_ISO8859_1, "latin_1" }, { UTF8TAB_ISO8859_1, "iso8859_1" }, { 0, 0 } }; const char *spl_utf8_check(const char *text) { const unsigned char *utext = (const unsigned char *)text; int inc = 0; for (const unsigned char *t = utext; *t; t += inc + 1) { #define IN(off, mask, value) ((t[inc=off] & (mask)) == (value)) if (IN(0, 0x80, 0x00)) continue; if (IN(0, 0xE0, 0xC0) && IN(1, 0xC0, 0x80)) continue; if (IN(0, 0xF0, 0xE0) && IN(1, 0xC0, 0x80) && IN(2, 0xC0, 0x80)) continue; if (IN(0, 0xF8, 0xF0) && IN(1, 0xC0, 0x80) && IN(2, 0xC0, 0x80) && IN(3, 0xC0, 0x80)) continue; return (char*)t; #undef IN } return 0; } char *spl_utf8_import(const char *text, const char *charset) { const unsigned char *utext = (const unsigned char *)text; charset_map_table_type *tab = 0; if (!strcmp("utf_8", charset)) return strdup(text); for (int i=0; charset_map[i].table; i++) if (!strcmp(charset_map[i].name, charset)) tab = charset_map[i].table; if (!tab) return 0; int result_len = 0; for (const unsigned char *t = utext; *t; t++) if ((*t & 0x80) == 0) result_len++; else result_len += strlen((char*)(tab[*t-128])); char *result = malloc(result_len+1); char *r = result; for (const unsigned char *t = utext; *t; t++) if ((*t & 0x80) == 0) *(r++) = *t; else { strcpy(r, (const char*)tab[*t-128]); r += strlen((const char*)tab[*t-128]); } assert(r == result+result_len); result[result_len] = 0; return result; } char *spl_utf8_export(const char *text, const char *charset) { const unsigned char *utext = (const unsigned char *)text; charset_map_table_type *tab = 0; if (!strcmp("utf_8", charset)) return strdup(text); for (int i=0; charset_map[i].table; i++) if (!strcmp(charset_map[i].name, charset)) tab = charset_map[i].table; if (!tab) return 0; int result_len = 0; char *result = malloc(strlen(text)+1); int inc = 0; for (const unsigned char *t = utext; *t; t += inc + 1) { #define IN(off, mask, value) ((t[inc=off] & (mask)) == (value)) if (IN(0, 0x80, 0x00)) { result[result_len++] = t[0]; continue; } for (int i=0; i<128; i++) { inc = strlen((const char*)tab[i]); if (!strncmp((const char*)t, (const char*)tab[i], inc)) { result[result_len++] = 128+i; // in fact a strlen of 2 would result in t increase of 3 // decrement inc to get correct t increment in for loop // raphael, 2007-10-08 inc--; goto next_export_char; } } result[result_len++] = '?'; if (IN(0, 0xE0, 0xC0) && IN(1, 0xC0, 0x80)) continue; if (IN(0, 0xF0, 0xE0) && IN(1, 0xC0, 0x80) && IN(2, 0xC0, 0x80)) continue; if (IN(0, 0xF8, 0xF0) && IN(1, 0xC0, 0x80) && IN(2, 0xC0, 0x80) && IN(3, 0xC0, 0x80)) continue; /* Input is not UTF-8. This should not happen.. */ for (inc=1; t[inc] & 0x80; t++) result[result_len++] = '?'; next_export_char:; #undef IN } result[result_len++] = 0; return realloc(result, result_len); }