// utf8purify.c
//
// Copyright (C) 2007, 2011 by Clifford Wolf
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// Read a wild mix of utf8 and latin1 from input
// Write pure utf8 to output
//
// A byte walks into a bar and orders a pint. Bartender asks him "What's
// wrong?" Byte says "Parity error." Bartender nods and says "Yeah, I
// thought you looked a bit off."

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

// When non-NULL, input is taken from this NUL-terminated string instead
// of file descriptor 0.
unsigned char *inputstr = NULL;

// When non-NULL, output is appended to this buffer (kept NUL-terminated)
// instead of being written to file descriptor 1.
unsigned char *outputstr = NULL;

// Lookahead buffer: the next three input bytes following the byte most
// recently returned by readbyte(). A slot holds -1 once input has run out.
int labuffer[3];

// Read a byte from input and shift it through the lookahead buffer.
//
// Returns the oldest lookahead entry (-1 if that slot marks end of input),
// moves the remaining two entries down, and refills the last slot with the
// next input byte, or with -1 when no more input is available.
int readbyte(void)
{
    int retval = labuffer[0];

    labuffer[0] = labuffer[1];
    labuffer[1] = labuffer[2];
    labuffer[2] = -1;

    if (inputstr != NULL) {
        // String mode: the terminating NUL acts as end of input and is
        // never consumed, so every later call keeps yielding -1.
        if (*inputstr)
            labuffer[2] = *(inputstr++);
    } else {
        unsigned char buffer;
        ssize_t got;

        // A read interrupted by a signal is not end of input: retry it.
        do {
            got = read(0, &buffer, 1);
        } while (got < 0 && errno == EINTR);

        if (got == 1)
            labuffer[2] = buffer;
    }

    return retval;
}

// Write a byte to the output.
//
// ch is truncated to a single unsigned char. In string mode the byte is
// appended to outputstr and the buffer is re-terminated; the caller must
// have sized the buffer for the whole output plus the terminator.
// Otherwise the byte is written to file descriptor 1 and a failure is
// reported with perror() without aborting.
void writebyte(int ch)
{
    unsigned char buffer = (unsigned)ch;

    if (outputstr) {
        *(outputstr++) = buffer;
        *outputstr = 0;
    } else {
        ssize_t put;

        // A write interrupted by a signal has written nothing: retry it.
        do {
            put = write(1, &buffer, 1);
        } while (put < 0 && errno == EINTR);

        if (put != 1)
            perror("Single byte write failed");
    }
}

// Convert the whole input to UTF-8.
//
// Bytes that start a structurally complete 2-, 3- or 4-byte UTF-8 sequence
// (lead byte followed by the required number of 10xxxxxx continuation
// bytes) are copied unchanged, as is 7-bit ASCII. Any other byte with the
// high bit set is taken to be a latin1 character and is re-encoded as a
// 2-byte UTF-8 sequence.
void worker(void)
{
    // Fill the lookahead buffer. The three returned values are whatever
    // the slots held before this run and are deliberately discarded.
    readbyte();
    readbyte();
    readbyte();

    // main loop
    while (1) {
        // read the first byte (rest is in the lookahead buffer)
        int ch = readbyte();

        // we have reached the end of the input
        if (ch < 0)
            break;

        // just normal 7-bit ascii
        if ((ch & 0x80) == 0) {
            writebyte(ch);
            continue;
        }

        // An end-of-input slot (-1) never satisfies the continuation
        // tests below, because (-1 & 0xC0) is 0xC0, not 0x80.

        // 2-byte utf-8 sequence (starting with '110')
        if ((ch & 0xE0) == 0xC0 && (labuffer[0] & 0xC0) == 0x80) {
            writebyte(ch);
            writebyte(readbyte());
            continue;
        }

        // 3-byte utf-8 sequence (starting with '1110')
        if ((ch & 0xF0) == 0xE0 && (labuffer[0] & 0xC0) == 0x80 &&
                (labuffer[1] & 0xC0) == 0x80) {
            writebyte(ch);
            writebyte(readbyte());
            writebyte(readbyte());
            continue;
        }

        // 4-byte utf-8 sequence (starting with '11110')
        if ((ch & 0xF8) == 0xF0 && (labuffer[0] & 0xC0) == 0x80 &&
                (labuffer[1] & 0xC0) == 0x80 && (labuffer[2] & 0xC0) == 0x80) {
            writebyte(ch);
            writebyte(readbyte());
            writebyte(readbyte());
            writebyte(readbyte());
            continue;
        }

        // so it seems to be a latin1 character: emit its code point
        // (0x80..0xFF) as the 2-byte sequence 110000xx 10xxxxxx
        writebyte(0xC0 | (ch >> 6));
        writebyte(0x80 | (ch & 0x3f));
    }
}

// Without arguments: filter file descriptor 0 to file descriptor 1.
//
// With arguments: treat each argument as a file name, convert the name
// itself to UTF-8 and, if that changed it, announce and perform the
// rename. A failed rename is reported on stderr and processing continues
// with the next argument.
//
// Returns 0, or 1 if memory for a converted name could not be allocated.
int main(int argc, char **argv)
{
    int i;
    char *buf;

    if (argc <= 1) {
        worker();
    } else {
        for (i = 1; i < argc; i++) {
            inputstr = (unsigned char*)argv[i];

            // Every input byte yields at most two output bytes; the
            // extra room covers the terminating NUL.
            buf = malloc(strlen(argv[i])*2 + 2);
            if (buf == NULL) {
                fprintf(stderr, "%s: Out of memory while processing `%s'\n",
                        argv[0], argv[i]);
                return 1;
            }

            outputstr = (unsigned char*)buf;
            *outputstr = 0;

            worker();

            if (strcmp(argv[i], buf)) {
                printf("%s: `%s' -> `%s'\n", argv[0], argv[i], buf);
                if (rename(argv[i], buf) < 0)
                    fprintf(stderr, "%s: Can't rename `%s' to `%s': %s\n",
                            argv[0], argv[i], buf, strerror(errno));
            }

            free(buf);
        }
    }

    return 0;
}