sm64

A Super Mario 64 decompilation
Log | Files | Refs | README | LICENSE

textconv.c (15630B)


      1 #include <ctype.h>
      2 #include <errno.h>
      3 #include <stdarg.h>
      4 #include <stdint.h>
      5 #include <stdlib.h>
      6 #include <stdio.h>
      7 #include <string.h>
      8 
      9 #include "hashtable.h"
     10 #include "utf8.h"
     11 
     12 #define ARRAY_COUNT(arr) (sizeof(arr) / sizeof(arr[0]))
     13 
     14 #define INVALID_CHAR 0xFFFFFFFF
     15 
     16 struct CharmapEntry
     17 {
     18     uint32_t unicode[3];
     19     int length; // length of the unicode array. TODO: use dynamic memory allocation
     20     int bytesCount;
     21     uint8_t bytes[4]; // bytes to convert unicode array to, (e.g. 'A' = 0x0A)
     22 };
     23 
     24 static struct HashTable *charmap;
     25 
     26 static void fatal_error(const char *msgfmt, ...)
     27 {
     28     va_list args;
     29 
     30     fputs("error: ", stderr);
     31 
     32     va_start(args, msgfmt);
     33     vfprintf(stderr, msgfmt, args);
     34     va_end(args);
     35 
     36     fputc('\n', stderr);
     37 
     38     exit(1);
     39 }
     40 
     41 static void parse_error(const char *filename, int lineNum, const char *msgfmt, ...)
     42 {
     43     va_list args;
     44 
     45     fprintf(stderr, "%s: line %i: ", filename, lineNum);
     46 
     47     va_start(args, msgfmt);
     48     vfprintf(stderr, msgfmt, args);
     49     va_end(args);
     50 
     51     fputc('\n', stderr);
     52 
     53     exit(1);
     54 }
     55 
     56 // Reads the whole file and returns a null-terminated buffer with its contents
     57 void *read_text_file(const char *filename)
     58 {
     59     if (strcmp(filename, "-") != 0)
     60     {
     61         FILE *file = fopen(filename, "rb");
     62         uint8_t *buffer;
     63         size_t size;
     64 
     65         if (file == NULL)
     66             fatal_error("failed to open file '%s' for reading: %s", filename, strerror(errno));
     67 
     68         // get size
     69         fseek(file, 0, SEEK_END);
     70         size = ftell(file);
     71 
     72         // allocate buffer
     73         buffer = malloc(size + 1);
     74         if (buffer == NULL)
     75             fatal_error("could not allocate buffer of size %u", (uint32_t)(size + 1));
     76 
     77         // read file
     78         fseek(file, 0, SEEK_SET);
     79         if (fread(buffer, size, 1, file) != 1)
     80             fatal_error("error reading from file '%s': %s", filename, strerror(errno));
     81 
     82         // null-terminate the buffer
     83         buffer[size] = 0;
     84 
     85         fclose(file);
     86 
     87         return buffer;
     88     }
     89     else
     90     {
     91         size_t size = 0;
     92         size_t capacity = 1024;
     93         uint8_t *buffer = malloc(capacity + 1);
     94 
     95         if (buffer == NULL)
     96             fatal_error("could not allocate buffer of size %u", (uint32_t)(capacity + 1));
     97 
     98         for (;;)
     99         {
    100             size += fread(buffer + size, 1, capacity - size, stdin);
    101             if (size == capacity)
    102             {
    103                 capacity *= 2;
    104                 buffer = realloc(buffer, capacity + 1);
    105                 if (buffer == NULL)
    106                     fatal_error("could not allocate buffer of size %u", (uint32_t)(capacity + 1));
    107             }
    108             else if (feof(stdin))
    109             {
    110                 break;
    111             }
    112             else
    113             {
    114                 fatal_error("error reading from stdin: %s", strerror(errno));
    115             }
    116         }
    117 
    118         // null-terminate the buffer
    119         buffer[size] = 0;
    120         return buffer;
    121     }
    122 }
    123 
    124 static char *skip_whitespace(char *str)
    125 {
    126     while (isspace(*str))
    127         str++;
    128     return str;
    129 }
    130 
    131 // null terminates the current line and returns a pointer to the next line
    132 static char *line_split(char *str)
    133 {
    134     while (*str != '\n')
    135     {
    136         if (*str == 0)
    137             return str;  // end of string
    138         str++;
    139     }
    140     *str = 0;  // terminate line
    141     return str + 1;
    142 }
    143 
    144 static char *parse_number(const char *str, unsigned int *num)
    145 {
    146     char *endptr;
    147     unsigned int n = strtol(str, &endptr, 0);
    148 
    149     *num = n;
    150     if (endptr > str)
    151         return endptr;
    152     else
    153         return NULL;
    154 }
    155 
    156 static int is_identifier_char(char c)
    157 {
    158     return isalnum(c) || c == '_';
    159 }
    160 
    161 static uint32_t get_escape_char(int c)
    162 {
    163     const uint8_t escapeTable[] =
    164     {
    165         ['0'] = '\0',
    166         ['a'] = '\a',
    167         ['b'] = '\b',
    168         ['f'] = '\f',
    169         ['n'] = '\n',
    170         ['r'] = '\r',
    171         ['t'] = '\t',
    172         ['v'] = '\v',
    173         ['\\'] = '\\',
    174         ['\''] = '\'',
    175         ['"'] = '"',
    176     };
    177 
    178     if ((unsigned int)c < ARRAY_COUNT(escapeTable) && (escapeTable[c] != 0 || c == '0'))
    179         return escapeTable[c];
    180     else
    181         return INVALID_CHAR;
    182 }
    183 
    184 static void read_charmap(const char *filename)
    185 {
    186     char *filedata = read_text_file(filename);
    187     char *line = filedata;
    188     int lineNum = 1;
    189 
    190     while (line[0] != 0)
    191     {
    192         char *nextLine = line_split(line);
    193 
    194         struct CharmapEntry entry;
    195         struct CharmapEntry *existing;
    196 
    197         line = skip_whitespace(line);
    198         if (line[0] != 0 && !(line[0] == '/' && line[1] == '/'))  // ignore empty lines and comments
    199         {
    200             int len = 0;
    201             /* Read Character */
    202 
    203             // opening quote
    204             if (*line != '\'')
    205                 parse_error(filename, lineNum, "expected '");
    206             line++;
    207 
    208             // perform analysis of charmap entry, we are in the quote
    209             while(1)
    210             {
    211                 if(*line == '\'')
    212                 {
    213                     line++;
    214                     break;
    215                 }
    216                 else if(len == ARRAY_COUNT(entry.unicode))
    217                 {
    218                     // TODO: Use dynamic memory allocation so this is unnecessary.
    219                     parse_error(filename, lineNum, "string limit exceeded");
    220                 }
    221                 else if (*line == '\\')
    222                 {
    223                     line++; // advance to get the character being escaped
    224                     if (*line == '\r')
    225                         line++;
    226                     if (*line == '\n')
    227                     {
    228                         // Backslash at end of line is ignored
    229                         continue;
    230                     }
    231                     entry.unicode[len] = get_escape_char(*line);
    232                     if (entry.unicode[len] == INVALID_CHAR)
    233                         parse_error(filename, lineNum, "unknown escape sequence \\%c", *line);
    234                     line++; // increment again to get past the escape sequence.
    235                 }
    236                 else
    237                 {
    238                     line = utf8_decode(line, &entry.unicode[len]);
    239                     if (line == NULL)
    240                         parse_error(filename, lineNum, "invalid UTF8");
    241                 }
    242                 len++;
    243             }
    244             entry.length = len;
    245 
    246             // equals sign
    247             line = skip_whitespace(line);
    248             if (*line != '=')
    249                 parse_error(filename, lineNum, "expected = after character \\%c", *line);
    250             line++;
    251 
    252             entry.bytesCount = 0;
    253 
    254             // value
    255             while (1)
    256             {
    257                 uint32_t value;
    258 
    259                 if (entry.bytesCount >= 4)
    260                     parse_error(filename, lineNum, "more than 4 values specified");
    261 
    262                 line = skip_whitespace(line);
    263 
    264                 line = parse_number(line, &value);
    265                 if (line == NULL)
    266                     parse_error(filename, lineNum, "expected number after =");
    267                 if (value > 0xFF)
    268                     parse_error(filename, lineNum, "0x%X is larger than 1 byte", value);
    269 
    270                 entry.bytes[entry.bytesCount] = value;
    271                 entry.bytesCount++;
    272 
    273                 line = skip_whitespace(line);
    274                 if (*line == 0)
    275                     break;
    276                 if (*line != ',')
    277                     parse_error(filename, lineNum, "junk at end of line");
    278                 line++;
    279             }
    280 
    281             existing = hashtable_query(charmap, &entry);
    282 
    283             if (existing != NULL) {
    284                 const char *fmt = "0x%02X, ";
    285                 int fmtlen = 6;
    286 
    287                 char str[32];
    288                 int i;
    289 
    290                 for (i = 0; i < existing->bytesCount; i++) {
    291                     sprintf(&str[fmtlen * i], fmt, existing->bytes[i]);
    292                 }
    293 
    294                 str[fmtlen * i - 2] = '\0';
    295 
    296                 parse_error(filename, lineNum, "entry for character already exists (%s)", str);
    297             } else {
    298                 hashtable_insert(charmap, &entry);
    299             }
    300         }
    301 
    302         line = nextLine;
    303         lineNum++;
    304     }
    305 
    306     free(filedata);
    307 }
    308 
    309 static int count_line_num(const char *start, const char *pos)
    310 {
    311     const char *c;
    312     int lineNum = 1;
    313 
    314     for (c = start; c < pos; c++)
    315     {
    316         if (*c == '\n')
    317             lineNum++;
    318     }
    319     return lineNum;
    320 }
    321 
    322 static char *convert_string(char *pos, FILE *fout, const char *inputFileName, char *start, int uncompressed, int cnOneByte)
    323 {
    324     const struct CharmapEntry terminatorInput = {.unicode = {'\0'}, .length = 1};
    325     struct CharmapEntry *terminator;
    326     int hasString = 0;
    327     int i;
    328 
    329     while (1)
    330     {
    331         pos = skip_whitespace(pos);
    332         if (*pos == ')')
    333         {
    334             if (hasString)
    335                 break;
    336             else
    337                 parse_error(inputFileName, count_line_num(start, pos), "expected quoted string after '_('");
    338         }
    339         else if (*pos != '"')
    340             parse_error(inputFileName, count_line_num(start, pos), "unexpected character '%c'", *pos);
    341         pos++;
    342 
    343         hasString = 1;
    344 
    345         // convert quoted string
    346         while (*pos != '"')
    347         {
    348             struct CharmapEntry input;
    349             struct CharmapEntry *last_valid_entry = NULL;
    350             struct CharmapEntry *entry;
    351             uint32_t c;
    352             int length = 0;
    353             char* last_valid_pos = NULL;
    354             // safely erase the unicode area before use
    355             memset(input.unicode, 0, sizeof (input.unicode));
    356             input.length = 0;
    357 
    358             // Find a charmap entry of longest length possible starting from this position
    359             while (*pos != '"')
    360             {
    361                 if ((uncompressed && length == 1) || length == ARRAY_COUNT(entry->unicode))
    362                 {
    363                     // Stop searching after length 3; we only support strings of lengths up
    364                     // to that right now. Unless uncompressed is set, in which we ignore multi
    365                     // texts by discarding entries longer than 1.
    366                     break;
    367                 }
    368 
    369                 if (*pos == 0)
    370                     parse_error(inputFileName, count_line_num(start, pos), "EOF in string literal");
    371                 if (*pos == '\\')
    372                 {
    373                     pos++;
    374                     c = get_escape_char(*pos);
    375                     if (c == INVALID_CHAR)
    376                         parse_error(inputFileName, count_line_num(start, pos), "unknown escape sequence \\%c", *pos);
    377                     input.unicode[length] = c;
    378                     pos++;
    379                 }
    380                 else
    381                 {
    382                     pos = utf8_decode(pos, &input.unicode[length]);
    383                     if (pos == NULL)
    384                         parse_error(inputFileName, count_line_num(start, pos), "invalid unicode encountered in file");
    385                 }
    386                 length++;
    387                 input.length = length;
    388 
    389                 entry = hashtable_query(charmap, &input);
    390                 if (entry != NULL)
    391                 {
    392                     last_valid_entry = entry;
    393                     last_valid_pos = pos;
    394                 }
    395             }
    396 
    397             entry = last_valid_entry;
    398             pos = last_valid_pos;
    399             if (entry == NULL)
    400                 parse_error(inputFileName, count_line_num(start, pos), "no charmap entry for U+%X", input.unicode[0]);
    401 
    402             for (i = 0; i < entry->bytesCount; i++) {
    403                 if (entry->bytesCount > 1 && cnOneByte && i % 2 == 0) {
    404                     continue;
    405                 }
    406                 fprintf(fout, "0x%02X,", entry->bytes[i]);
    407             }
    408         }
    409         pos++;  // skip over closing '"'
    410     }
    411     pos++;  // skip over closing ')'
    412     // use terminator \0 from charmap if provided, otherwise default 0xFF
    413     terminator = hashtable_query(charmap, &terminatorInput);
    414     if (terminator == NULL)
    415         fputs("0xFF", fout);
    416     else
    417     {
    418         for (i = 0; i < (cnOneByte ? 1 : terminator->bytesCount); i++)
    419             fprintf(fout, "0x%02X,", terminator->bytes[i]);
    420     }
    421     return pos;
    422 }
    423 
    424 static void convert_file(const char *infilename, const char *outfilename)
    425 {
    426     char *in = read_text_file(infilename);
    427     FILE *fout = strcmp(outfilename, "-") != 0 ? fopen(outfilename, "wb") : stdout;
    428 
    429     if (fout == NULL)
    430         fatal_error("failed to open file '%s' for writing: %s", strerror(errno));
    431 
    432     char *start = in;
    433     char *end = in;
    434     char *pos = in;
    435 
    436     while (1)
    437     {
    438         if (*pos == 0)  // end of file
    439             goto eof;
    440 
    441         // check for comment
    442         if (*pos == '/')
    443         {
    444             pos++;
    445             // skip over // comment
    446             if (*pos == '/')
    447             {
    448                 pos++;
    449                 // skip over next newline
    450                 while (*pos != '\n')
    451                 {
    452                     if (*pos == 0)
    453                         goto eof;
    454                     pos++;
    455                 }
    456                 pos++;
    457             }
    458             // skip over /* */ comment
    459             else if (*pos == '*')
    460             {
    461                 pos++;
    462                 while (*pos != '*' && pos[1] != '/')
    463                 {
    464                     if (*pos == 0)
    465                         goto eof;
    466                     pos++;
    467                 }
    468                 pos += 2;
    469             }
    470         }
    471         // skip over normal string literal
    472         else if (*pos == '"')
    473         {
    474             pos++;
    475             while (*pos != '"')
    476             {
    477                 if (*pos == 0)
    478                     goto eof;
    479                 if (*pos == '\\')
    480                     pos++;
    481                 pos++;
    482             }
    483             pos++;
    484         }
    485         // check for _( sequence
    486         else if ((*pos == '_') && (pos == in || !is_identifier_char(pos[-1])))
    487         {
    488             int uncompressed = 0;
    489             int cnOneByte = 0;
    490             end = pos;
    491             pos++;
    492             if (*pos == '_') // an extra _ signifies uncompressed strings. Enable uncompressed flag
    493             {
    494                 pos++;
    495                 uncompressed = 1;
    496             }
    497             if (*pos == '%') // an extra % signifies a one-byte long characters on iQue instead of two-byte
    498             {
    499                 pos++;
    500                 cnOneByte = 1;
    501             }
    502             if (*pos == '(')
    503             {
    504                 pos++;
    505                 fwrite(start, end - start, 1, fout);
    506                 pos = convert_string(pos, fout, infilename, in, uncompressed, cnOneByte);
    507                 start = pos;
    508             }
    509         }
    510         else
    511         {
    512             pos++;
    513         }
    514     }
    515 
    516   eof:
    517     fwrite(start, pos - start, 1, fout);
    518     if (strcmp(outfilename, "-") != 0)
    519         fclose(fout);
    520     free(in);
    521 }
    522 
    523 static unsigned int charmap_hash(const void *value)
    524 {
    525     const struct CharmapEntry* entry = value;
    526     unsigned int ret = 0;
    527     for (int i = 0; i < entry->length; i++)
    528         ret = ret * 17 + entry->unicode[i];
    529     return ret;
    530 }
    531 
    532 static int charmap_cmp(const void *a, const void *b)
    533 {
    534     const struct CharmapEntry *ea = a;
    535     const struct CharmapEntry *eb = b;
    536     if (ea->length != eb->length)
    537         return 0;
    538     for(int i = 0; i < ea->length; i++)
    539         if(ea->unicode[i] != eb->unicode[i])
    540             return 0;
    541     return 1;
    542 }
    543 
    544 static void usage(const char *execName)
    545 {
    546     fprintf(stderr, "Usage: %s CHARMAP INPUT OUTPUT\n", execName);
    547 }
    548 
    549 int main(int argc, char **argv)
    550 {
    551     if (argc != 4)
    552     {
    553         usage(argv[0]);
    554         return 1;
    555     }
    556 
    557     charmap = hashtable_new(charmap_hash, charmap_cmp, 256, sizeof(struct CharmapEntry));
    558 
    559     read_charmap(argv[1]);
    560     convert_file(argv[2], argv[3]);
    561 
    562     hashtable_free(charmap);
    563 
    564     return 0;
    565 }