textconv.c (15630B)
1 #include <ctype.h> 2 #include <errno.h> 3 #include <stdarg.h> 4 #include <stdint.h> 5 #include <stdlib.h> 6 #include <stdio.h> 7 #include <string.h> 8 9 #include "hashtable.h" 10 #include "utf8.h" 11 12 #define ARRAY_COUNT(arr) (sizeof(arr) / sizeof(arr[0])) 13 14 #define INVALID_CHAR 0xFFFFFFFF 15 16 struct CharmapEntry 17 { 18 uint32_t unicode[3]; 19 int length; // length of the unicode array. TODO: use dynamic memory allocation 20 int bytesCount; 21 uint8_t bytes[4]; // bytes to convert unicode array to, (e.g. 'A' = 0x0A) 22 }; 23 24 static struct HashTable *charmap; 25 26 static void fatal_error(const char *msgfmt, ...) 27 { 28 va_list args; 29 30 fputs("error: ", stderr); 31 32 va_start(args, msgfmt); 33 vfprintf(stderr, msgfmt, args); 34 va_end(args); 35 36 fputc('\n', stderr); 37 38 exit(1); 39 } 40 41 static void parse_error(const char *filename, int lineNum, const char *msgfmt, ...) 42 { 43 va_list args; 44 45 fprintf(stderr, "%s: line %i: ", filename, lineNum); 46 47 va_start(args, msgfmt); 48 vfprintf(stderr, msgfmt, args); 49 va_end(args); 50 51 fputc('\n', stderr); 52 53 exit(1); 54 } 55 56 // Reads the whole file and returns a null-terminated buffer with its contents 57 void *read_text_file(const char *filename) 58 { 59 if (strcmp(filename, "-") != 0) 60 { 61 FILE *file = fopen(filename, "rb"); 62 uint8_t *buffer; 63 size_t size; 64 65 if (file == NULL) 66 fatal_error("failed to open file '%s' for reading: %s", filename, strerror(errno)); 67 68 // get size 69 fseek(file, 0, SEEK_END); 70 size = ftell(file); 71 72 // allocate buffer 73 buffer = malloc(size + 1); 74 if (buffer == NULL) 75 fatal_error("could not allocate buffer of size %u", (uint32_t)(size + 1)); 76 77 // read file 78 fseek(file, 0, SEEK_SET); 79 if (fread(buffer, size, 1, file) != 1) 80 fatal_error("error reading from file '%s': %s", filename, strerror(errno)); 81 82 // null-terminate the buffer 83 buffer[size] = 0; 84 85 fclose(file); 86 87 return buffer; 88 } 89 else 90 { 91 size_t size = 0; 92 size_t capacity = 1024; 93 uint8_t *buffer = malloc(capacity + 1); 94 95 if (buffer == NULL) 96 fatal_error("could not allocate buffer of size %u", (uint32_t)(capacity + 1)); 97 98 for (;;) 99 { 100 size += fread(buffer + size, 1, capacity - size, stdin); 101 if (size == capacity) 102 { 103 capacity *= 2; 104 buffer = realloc(buffer, capacity + 1); 105 if (buffer == NULL) 106 fatal_error("could not allocate buffer of size %u", (uint32_t)(capacity + 1)); 107 } 108 else if (feof(stdin)) 109 { 110 break; 111 } 112 else 113 { 114 fatal_error("error reading from stdin: %s", strerror(errno)); 115 } 116 } 117 118 // null-terminate the buffer 119 buffer[size] = 0; 120 return buffer; 121 } 122 } 123 124 static char *skip_whitespace(char *str) 125 { 126 while (isspace(*str)) 127 str++; 128 return str; 129 } 130 131 // null terminates the current line and returns a pointer to the next line 132 static char *line_split(char *str) 133 { 134 while (*str != '\n') 135 { 136 if (*str == 0) 137 return str; // end of string 138 str++; 139 } 140 *str = 0; // terminate line 141 return str + 1; 142 } 143 144 static char *parse_number(const char *str, unsigned int *num) 145 { 146 char *endptr; 147 unsigned int n = strtol(str, &endptr, 0); 148 149 *num = n; 150 if (endptr > str) 151 return endptr; 152 else 153 return NULL; 154 } 155 156 static int is_identifier_char(char c) 157 { 158 return isalnum(c) || c == '_'; 159 } 160 161 static uint32_t get_escape_char(int c) 162 { 163 const uint8_t escapeTable[] = 164 { 165 ['0'] = '\0', 166 ['a'] = '\a', 167 ['b'] = '\b', 168 ['f'] = '\f', 169 ['n'] = '\n', 170 ['r'] = '\r', 171 ['t'] = '\t', 172 ['v'] = '\v', 173 ['\\'] = '\\', 174 ['\''] = '\'', 175 ['"'] = '"', 176 }; 177 178 if ((unsigned int)c < ARRAY_COUNT(escapeTable) && (escapeTable[c] != 0 || c == '0')) 179 return escapeTable[c]; 180 else 181 return INVALID_CHAR; 182 } 183 184 static void read_charmap(const char *filename) 185 { 186 char *filedata = read_text_file(filename); 187 char *line = filedata; 188 int lineNum = 1; 189 190 while (line[0] != 0) 191 { 192 char *nextLine = line_split(line); 193 194 struct CharmapEntry entry; 195 struct CharmapEntry *existing; 196 197 line = skip_whitespace(line); 198 if (line[0] != 0 && !(line[0] == '/' && line[1] == '/')) // ignore empty lines and comments 199 { 200 int len = 0; 201 /* Read Character */ 202 203 // opening quote 204 if (*line != '\'') 205 parse_error(filename, lineNum, "expected '"); 206 line++; 207 208 // perform analysis of charmap entry, we are in the quote 209 while(1) 210 { 211 if(*line == '\'') 212 { 213 line++; 214 break; 215 } 216 else if(len == ARRAY_COUNT(entry.unicode)) 217 { 218 // TODO: Use dynamic memory allocation so this is unnecessary. 219 parse_error(filename, lineNum, "string limit exceeded"); 220 } 221 else if (*line == '\\') 222 { 223 line++; // advance to get the character being escaped 224 if (*line == '\r') 225 line++; 226 if (*line == '\n') 227 { 228 // Backslash at end of line is ignored 229 continue; 230 } 231 entry.unicode[len] = get_escape_char(*line); 232 if (entry.unicode[len] == INVALID_CHAR) 233 parse_error(filename, lineNum, "unknown escape sequence \\%c", *line); 234 line++; // increment again to get past the escape sequence. 235 } 236 else 237 { 238 line = utf8_decode(line, &entry.unicode[len]); 239 if (line == NULL) 240 parse_error(filename, lineNum, "invalid UTF8"); 241 } 242 len++; 243 } 244 entry.length = len; 245 246 // equals sign 247 line = skip_whitespace(line); 248 if (*line != '=') 249 parse_error(filename, lineNum, "expected = after character \\%c", *line); 250 line++; 251 252 entry.bytesCount = 0; 253 254 // value 255 while (1) 256 { 257 uint32_t value; 258 259 if (entry.bytesCount >= 4) 260 parse_error(filename, lineNum, "more than 4 values specified"); 261 262 line = skip_whitespace(line); 263 264 line = parse_number(line, &value); 265 if (line == NULL) 266 parse_error(filename, lineNum, "expected number after ="); 267 if (value > 0xFF) 268 parse_error(filename, lineNum, "0x%X is larger than 1 byte", value); 269 270 entry.bytes[entry.bytesCount] = value; 271 entry.bytesCount++; 272 273 line = skip_whitespace(line); 274 if (*line == 0) 275 break; 276 if (*line != ',') 277 parse_error(filename, lineNum, "junk at end of line"); 278 line++; 279 } 280 281 existing = hashtable_query(charmap, &entry); 282 283 if (existing != NULL) { 284 const char *fmt = "0x%02X, "; 285 int fmtlen = 6; 286 287 char str[32]; 288 int i; 289 290 for (i = 0; i < existing->bytesCount; i++) { 291 sprintf(&str[fmtlen * i], fmt, existing->bytes[i]); 292 } 293 294 str[fmtlen * i - 2] = '\0'; 295 296 parse_error(filename, lineNum, "entry for character already exists (%s)", str); 297 } else { 298 hashtable_insert(charmap, &entry); 299 } 300 } 301 302 line = nextLine; 303 lineNum++; 304 } 305 306 free(filedata); 307 } 308 309 static int count_line_num(const char *start, const char *pos) 310 { 311 const char *c; 312 int lineNum = 1; 313 314 for (c = start; c < pos; c++) 315 { 316 if (*c == '\n') 317 lineNum++; 318 } 319 return lineNum; 320 } 321 322 static char *convert_string(char *pos, FILE *fout, const char *inputFileName, char *start, int uncompressed, int cnOneByte) 323 { 324 const struct CharmapEntry terminatorInput = {.unicode = {'\0'}, .length = 1}; 325 struct CharmapEntry *terminator; 326 int hasString = 0; 327 int i; 328 329 while (1) 330 { 331 pos = skip_whitespace(pos); 332 if (*pos == ')') 333 { 334 if (hasString) 335 break; 336 else 337 parse_error(inputFileName, count_line_num(start, pos), "expected quoted string after '_('"); 338 } 339 else if (*pos != '"') 340 parse_error(inputFileName, count_line_num(start, pos), "unexpected character '%c'", *pos); 341 pos++; 342 343 hasString = 1; 344 345 // convert quoted string 346 while (*pos != '"') 347 { 348 struct CharmapEntry input; 349 struct CharmapEntry *last_valid_entry = NULL; 350 struct CharmapEntry *entry; 351 uint32_t c; 352 int length = 0; 353 char* last_valid_pos = NULL; 354 // safely erase the unicode area before use 355 memset(input.unicode, 0, sizeof (input.unicode)); 356 input.length = 0; 357 358 // Find a charmap entry of longest length possible starting from this position 359 while (*pos != '"') 360 { 361 if ((uncompressed && length == 1) || length == ARRAY_COUNT(entry->unicode)) 362 { 363 // Stop searching after length 3; we only support strings of lengths up 364 // to that right now. Unless uncompressed is set, in which we ignore multi 365 // texts by discarding entries longer than 1. 366 break; 367 } 368 369 if (*pos == 0) 370 parse_error(inputFileName, count_line_num(start, pos), "EOF in string literal"); 371 if (*pos == '\\') 372 { 373 pos++; 374 c = get_escape_char(*pos); 375 if (c == INVALID_CHAR) 376 parse_error(inputFileName, count_line_num(start, pos), "unknown escape sequence \\%c", *pos); 377 input.unicode[length] = c; 378 pos++; 379 } 380 else 381 { 382 pos = utf8_decode(pos, &input.unicode[length]); 383 if (pos == NULL) 384 parse_error(inputFileName, count_line_num(start, pos), "invalid unicode encountered in file"); 385 } 386 length++; 387 input.length = length; 388 389 entry = hashtable_query(charmap, &input); 390 if (entry != NULL) 391 { 392 last_valid_entry = entry; 393 last_valid_pos = pos; 394 } 395 } 396 397 entry = last_valid_entry; 398 pos = last_valid_pos; 399 if (entry == NULL) 400 parse_error(inputFileName, count_line_num(start, pos), "no charmap entry for U+%X", input.unicode[0]); 401 402 for (i = 0; i < entry->bytesCount; i++) { 403 if (entry->bytesCount > 1 && cnOneByte && i % 2 == 0) { 404 continue; 405 } 406 fprintf(fout, "0x%02X,", entry->bytes[i]); 407 } 408 } 409 pos++; // skip over closing '"' 410 } 411 pos++; // skip over closing ')' 412 // use terminator \0 from charmap if provided, otherwise default 0xFF 413 terminator = hashtable_query(charmap, &terminatorInput); 414 if (terminator == NULL) 415 fputs("0xFF", fout); 416 else 417 { 418 for (i = 0; i < (cnOneByte ? 1 : terminator->bytesCount); i++) 419 fprintf(fout, "0x%02X,", terminator->bytes[i]); 420 } 421 return pos; 422 } 423 424 static void convert_file(const char *infilename, const char *outfilename) 425 { 426 char *in = read_text_file(infilename); 427 FILE *fout = strcmp(outfilename, "-") != 0 ? fopen(outfilename, "wb") : stdout; 428 429 if (fout == NULL) 430 fatal_error("failed to open file '%s' for writing: %s", strerror(errno)); 431 432 char *start = in; 433 char *end = in; 434 char *pos = in; 435 436 while (1) 437 { 438 if (*pos == 0) // end of file 439 goto eof; 440 441 // check for comment 442 if (*pos == '/') 443 { 444 pos++; 445 // skip over // comment 446 if (*pos == '/') 447 { 448 pos++; 449 // skip over next newline 450 while (*pos != '\n') 451 { 452 if (*pos == 0) 453 goto eof; 454 pos++; 455 } 456 pos++; 457 } 458 // skip over /* */ comment 459 else if (*pos == '*') 460 { 461 pos++; 462 while (*pos != '*' && pos[1] != '/') 463 { 464 if (*pos == 0) 465 goto eof; 466 pos++; 467 } 468 pos += 2; 469 } 470 } 471 // skip over normal string literal 472 else if (*pos == '"') 473 { 474 pos++; 475 while (*pos != '"') 476 { 477 if (*pos == 0) 478 goto eof; 479 if (*pos == '\\') 480 pos++; 481 pos++; 482 } 483 pos++; 484 } 485 // check for _( sequence 486 else if ((*pos == '_') && (pos == in || !is_identifier_char(pos[-1]))) 487 { 488 int uncompressed = 0; 489 int cnOneByte = 0; 490 end = pos; 491 pos++; 492 if (*pos == '_') // an extra _ signifies uncompressed strings. Enable uncompressed flag 493 { 494 pos++; 495 uncompressed = 1; 496 } 497 if (*pos == '%') // an extra % signifies a one-byte long characters on iQue instead of two-byte 498 { 499 pos++; 500 cnOneByte = 1; 501 } 502 if (*pos == '(') 503 { 504 pos++; 505 fwrite(start, end - start, 1, fout); 506 pos = convert_string(pos, fout, infilename, in, uncompressed, cnOneByte); 507 start = pos; 508 } 509 } 510 else 511 { 512 pos++; 513 } 514 } 515 516 eof: 517 fwrite(start, pos - start, 1, fout); 518 if (strcmp(outfilename, "-") != 0) 519 fclose(fout); 520 free(in); 521 } 522 523 static unsigned int charmap_hash(const void *value) 524 { 525 const struct CharmapEntry* entry = value; 526 unsigned int ret = 0; 527 for (int i = 0; i < entry->length; i++) 528 ret = ret * 17 + entry->unicode[i]; 529 return ret; 530 } 531 532 static int charmap_cmp(const void *a, const void *b) 533 { 534 const struct CharmapEntry *ea = a; 535 const struct CharmapEntry *eb = b; 536 if (ea->length != eb->length) 537 return 0; 538 for(int i = 0; i < ea->length; i++) 539 if(ea->unicode[i] != eb->unicode[i]) 540 return 0; 541 return 1; 542 } 543 544 static void usage(const char *execName) 545 { 546 fprintf(stderr, "Usage: %s CHARMAP INPUT OUTPUT\n", execName); 547 } 548 549 int main(int argc, char **argv) 550 { 551 if (argc != 4) 552 { 553 usage(argv[0]); 554 return 1; 555 } 556 557 charmap = hashtable_new(charmap_hash, charmap_cmp, 256, sizeof(struct CharmapEntry)); 558 559 read_charmap(argv[1]); 560 convert_file(argv[2], argv[3]); 561 562 hashtable_free(charmap); 563 564 return 0; 565 }