#define _FILE_OFFSET_BITS 64 #define _LARGEFILE_SOURCE #define _GNU_SOURCE /* for strnlen */ #define HAS_READLINE /* remove if you don't have readline */ //#define DEBUG #include #include #include #include #include #include #ifdef HAS_READLINE #include #endif #include "viewhtml.h" #ifdef DEBUG # define debug(...) fprintf(stderr, __VA_ARGS__) #else # define debug(...) do {} while (0) #endif typedef enum { MDE_Success = 0, MDE_BadSignature, MDE_TruncatedFile, MDE_OutOfMemory, MDE_InternalError, MDE_DecompressionFailed, MDE_InvalidData, MDE_EntryNotFound, } MDError; const char *const mde_string[] = { "Success", "Bad signature", "Truncated file", "Out of memory", "Internal error", "Decompression failed", "Invalid data", "Entry not found", }; typedef struct { uint32_t metadata_checksum; // guess uint32_t unknown2; // 0 uint32_t index_entries; uint32_t unknown4; // 0 uint32_t num_entries; uint32_t unknown6; // 0 uint32_t index_memsize; uint32_t unknown8; // 0 uint32_t index_containersize; // compressed size + 8 uint32_t unknown10; // 0 uint32_t unknown11; // 0x37 uint32_t unknown12; // 0x05 0xEE 0x00 0x9F (checksum?) uint32_t unknown13; // 0x02 0x00 0x00 0x00 uint32_t unknown14; // 0x0B 0x6C 0x09 0x7E } MDHeader; typedef struct { char *last_word; uint32_t unknown1; // 0 uint32_t compressed_size; uint32_t unknown3; // 0 uint32_t uncompressed_size; // uncompressed size? 0xFFFD uint32_t unknown5; // 0 uint32_t unknown6; // 0x93D -- number of entries in block? char *next_word; off_t offset; } MDIndexEntry; typedef struct { uint32_t unknown1; // 0 uint32_t unknown2; // 0x95AC uint32_t unknown3; // 0 uint32_t unknown4; // 0x247809 uint32_t unknown5; // 0 uint32_t disksize; uint32_t unknown7; // 0 uint32_t unknown8; // 0xAC6FBFF7 (checksum?) } MDLengthsHeader; typedef struct { uint32_t unknown1; // or high-order bits in 64-bit number? uint32_t disk_size; uint32_t unknown3; // ditto uint32_t mem_size; } MDLengthsEntry; typedef struct { off_t disk; off_t mem; } MDOffsetsInfo; typedef struct { MDHeader header; MDIndexEntry *index; //size_t length_entries; MDLengthsHeader lengths_header; MDLengthsEntry *lengths; MDOffsetsInfo *offsets; off_t header_end; //off_t lengths_start; off_t articles_start; } MDFile; #define check(retval) do { error = (retval); if (error != MDE_Success) return error; } while (false) static uint64_t byteswap64(uint64_t word) { return (word & 0xFF) << 56 | (word & 0xFF00) << 40 | (word & 0xFF0000) << 24 | (word & 0xFF000000) << 8 | (word & 0xFF00000000) >> 8 | (word & 0xFF0000000000) >> 24 | (word & 0xFF000000000000) >> 40 | (word & 0xFF00000000000000) >> 56; } static void byteswap32_var(uint32_t *word) { *word = (*word & 0xFF) << 24 | (*word & 0xFF00) << 8 | (*word & 0xFF0000) >> 8 | (*word & 0xFF000000) >> 24; } static uint32_t byteswap32(uint32_t word) { return (word & 0xFF) << 24 | (word & 0xFF00) << 8 | (word & 0xFF0000) >> 8 | (word & 0xFF000000) >> 24; } static uint16_t byteswap16(uint16_t word) { return (word & 0xFF) << 8 | (word & 0xFF00) >> 8; } static MDError get16(const uint8_t *data, const size_t len, size_t *offset, uint16_t *word) { if (*offset + sizeof(*word) > len) return MDE_TruncatedFile; *word = byteswap16(*(uint16_t*)(data+*offset)); *offset += sizeof(*word); return MDE_Success; } static MDError get32(const uint8_t *data, const size_t len, size_t *offset, uint32_t *word) { if (*offset + sizeof(*word) > len) return MDE_TruncatedFile; *word = byteswap32(*(uint32_t*)(data+*offset)); *offset += sizeof(*word); return MDE_Success; } static MDError getstring(const uint8_t *data, const size_t len, size_t *offset, size_t stringlen, char **string) { if (*offset + stringlen+1 > len) return MDE_TruncatedFile; if (data[*offset+stringlen] != 0) return MDE_InvalidData; *string = malloc(stringlen+1); if (!*string) return MDE_OutOfMemory; memcpy(*string, (char*)(data+*offset), stringlen+1); *offset += stringlen+1; return MDE_Success; } static int mdstrcmp(const char *sa, const char *sb) { const unsigned char *a = (const unsigned char*)sa; const unsigned char *b = (const unsigned char*)sb; static const char punct[] = " !\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"; for (; *a && *b; a++, b++) { unsigned char c = tolower(*a); unsigned char d = tolower(*b); // Binary equality (after converting to lowercase) //fprintf(stderr, "%c %c\n", c, d); if (c == d) continue; // Ignore spaces and punctuation while (a[0] && strchr(punct, a[0])) a++; while (b[0] && strchr(punct, b[0])) b++; c = tolower(*a); d = tolower(*b); if (c == d) continue; else return ((int)c) - ((int)d); } // Ignore trailing spaces and punctuation while (a[0] && strchr(punct, a[0])) a++; while (b[0] && strchr(punct, b[0])) b++; return ((int)*a) - ((int)*b); } /* static int sign(int n) { if (n < 0) return -1; if (n > 0) return 1; else return 0; } static int mdstrcmp(const char *a, const char *b) { int i = strcasecmp(a, b); int j = _mdstrcmp(a, b); debug("strcasecmp %3d, _mdstrcmp %3d\n", i, j); if (sign(i) != sign(j)) { debug(" %s\n %s\n", a, b); abort(); } return i; }*/ MDError read_signature(FILE *input) { uint16_t signature; if (fread(&signature, 2, 1, input) != 1) return MDE_BadSignature; if (signature != 0) return MDE_BadSignature; return MDE_Success; } MDError skip_metadata(FILE *input) { uint16_t ucs2char; do { if (fread(&ucs2char, 2, 1, input) != 1) return MDE_TruncatedFile; } while (ucs2char != 0); return MDE_Success; } MDError read_header(FILE *input, MDHeader *header) { if (fread(header, sizeof(MDHeader), 1, input) == 1) { /* Byte-swap */ for (size_t i = 0; i < sizeof(MDHeader)/4; i++) { byteswap32_var(((uint32_t*)header)+i); } return MDE_Success; } return MDE_TruncatedFile; } MDError parse_index_entry(const MDFile *file, uint8_t *data, bool last, size_t *offset, MDIndexEntry *entry) { MDError error; uint16_t namelen; size_t len = file->header.index_memsize; check(get16(data, len, offset, &namelen)); check(getstring(data, len, offset, namelen, &entry->last_word)); //debug("from: %s\n", entry->last_word); check(get32(data, len, offset, &entry->unknown1)); check(get32(data, len, offset, &entry->compressed_size)); check(get32(data, len, offset, &entry->unknown3)); check(get32(data, len, offset, &entry->uncompressed_size)); if (last) { /* Last entry, so skip the next-entry stuff */ entry->unknown5 = entry->unknown6 = 0; entry->next_word = NULL; return MDE_Success; } check(get32(data, len, offset, &entry->unknown5)); check(get32(data, len, offset, &entry->unknown6)); check(get16(data, len, offset, &namelen)); check(getstring(data, len, offset, namelen, &entry->next_word)); //debug("from: %s to: %s\n", entry->last_word, entry->next_word); return MDE_Success; } MDError read_index(FILE *input, MDFile *file) { MDError error; uint8_t *index = malloc(file->header.index_memsize); if (!index) return MDE_OutOfMemory; // Hack uLongf usize = file->header.index_memsize; int csize = file->header.index_containersize - 8; uint8_t *compressed = malloc(csize); if (!compressed) { free(index); return MDE_OutOfMemory; } if (fread(compressed, csize, 1, input) != 1) { free(compressed); free(index); return MDE_TruncatedFile; } //debug("compr crc32: %lx\n", crc32(0, compressed, csize)); //for (int i = 0; i < csize; i++) debug("%hx ", compressed[i]); int zstatus = uncompress(index, &usize, compressed, csize); free(compressed); if (zstatus != Z_OK) { free(index); return MDE_DecompressionFailed; } //file->index_count = byteswap32(*(uint32_t*)(index+4)); debug("unknown value: %d\n", byteswap32(*(uint32_t*)(index+4))); debug("uncompr crc32: %lx\n", crc32(0, index, usize)); debug(" adler32: %lx\n", adler32(0, index, usize)); file->index = malloc((file->header.index_entries)*sizeof(MDIndexEntry)); if (!file->index) { free(index); return MDE_OutOfMemory; } size_t offset = 11; off_t dataptr = ftello(input); for (size_t i = 0; i < file->header.index_entries; i++) { //debug("parsing entry %d. dataptr = %qd\n", i, dataptr); check(parse_index_entry(file, index, (i == file->header.index_entries-1), &offset, &file->index[i])); file->index[i].offset = dataptr+8; dataptr += file->index[i].compressed_size; } return MDE_Success; } MDError get_block_number(const MDFile *file, const char *word, size_t *number) { const char *first_word = ""; size_t i; for (i = 0; i < file->header.index_entries; i++) { /*printf("test[%d] %s in (%s %s) = (%d %d)\n", i, word, file->index[i].last_word, file->index[i].next_word, mdstrcmp(word, file->index[i].last_word), mdstrcmp(word, file->index[i].next_word));*/ // debug("compare %20s < %20s < %20s\n", first_word, word, file->index[i].last_word); if (mdstrcmp(word, first_word) < 0) { fprintf(stderr, "can't get block number\n"); return MDE_EntryNotFound; } if (mdstrcmp(word, file->index[i].last_word) <= 0) { debug("matching block %d\n", i); *number = i; return MDE_Success; } first_word = file->index[i].next_word; } return MDE_EntryNotFound; } MDError get_block(FILE *input, const MDIndexEntry *entry, uint8_t **block, size_t *blocklen) { uLongf usize = entry->uncompressed_size; fseeko(input, entry->offset, SEEK_SET); debug(" unkn1: %8d\n" " compressed: %8d\n" " unkn3: %8d\n" " uncompressed: %8d\n" " unkn5: %8d\n" " unkn6: %8d\n", // maybe the difference of the first characters between the first/last word entry->unknown1, entry->compressed_size, entry->unknown3, entry->uncompressed_size, entry->unknown5, entry->unknown6); uint8_t *compressed = malloc(entry->compressed_size); *block = malloc(usize); if (!compressed || !*block) { free(*block); free(compressed); return MDE_OutOfMemory; } debug("reading from %qd (%d bytes)\n", entry->offset, entry->compressed_size); if (fread(compressed, entry->compressed_size, 1, input) != 1) { free(compressed); return MDE_TruncatedFile; } int zstatus = uncompress(*block, &usize, compressed, entry->compressed_size); free(compressed); if (zstatus != Z_OK) { free(*block); return MDE_DecompressionFailed; } if ((*block)[usize-1] != 0) { free(*block); return MDE_InvalidData; } *blocklen = usize; return MDE_Success; } MDError find_offset_in_block(const MDFile *file, const uint8_t *block, size_t blocklen, const char *name, off_t *offset) { size_t blockpos = 0; /* Parse entries */ while (blockpos < blocklen) { *offset = byteswap64(*(const uint64_t*)(block + blockpos)); //printf("compare %s %s\n", name, (const char*)(block + blockpos + 8)); if (!mdstrcmp((const char*)(block + blockpos + 8), name)) return MDE_Success; /*if (mdstrcmp((const char*)(block + blockpos + 8), name) > 0) { for (const char *c = (char*)(block + blockpos + 8); *c; c++) debug("%hhx[%c] ", *c, *c); debug("\n"); return MDE_EntryNotFound; }*/ blockpos += 8 + strlen((const char*)(block + blockpos + 8)) + 1; } //debug("can't get offset in block\n"); return MDE_EntryNotFound; } MDError get_length_entry_number(const MDFile *file, off_t offset, size_t *entry_number) { for (size_t i = 0; i < file->lengths_header.disksize/sizeof(MDLengthsEntry); i++) { off_t start = file->offsets[i].mem; if ((offset >= start) && (offset < start + file->lengths[i].mem_size)) { *entry_number = i; return MDE_Success; } } fprintf(stderr, "Could not look up offset in lengths table\n"); return MDE_EntryNotFound; } MDError read_lengthtables(FILE *input, MDFile *file) { off_t blocks_end = file->index[file->header.index_entries-1].offset + file->index[file->header.index_entries-1].compressed_size-8; /* Read header */ if ((fseeko(input, blocks_end, SEEK_SET) == -1) || (fread(&file->lengths_header, sizeof(MDLengthsHeader), 1, input) != 1)) { return MDE_TruncatedFile; } /* Byte-swap */ for (size_t i = 0; i < sizeof(MDLengthsHeader)/4; i++) { byteswap32_var(((uint32_t*)&file->lengths_header)+i); } //file->lengths_start = blocks_end + sizeof(MDLengthsHeader); file->articles_start = blocks_end + sizeof(MDLengthsHeader) + file->lengths_header.disksize; size_t count = file->lengths_header.disksize/sizeof(MDLengthsEntry); /* Read all length entries */ //if (lengths_header.disksize != sizeof(MDLengthsEntry)*length_header) file->lengths = malloc(file->lengths_header.disksize); file->offsets = malloc(count*sizeof(MDOffsetsInfo)); if (!file->lengths || !file->offsets) { return MDE_OutOfMemory; } if (fread(file->lengths, file->lengths_header.disksize, 1, input) != 1) { free(file->lengths); return MDE_TruncatedFile; } off_t disk_offset = file->articles_start; off_t mem_offset = 0; for (size_t i = 0; i < count; i++) { /* Byte-swap */ byteswap32_var(&file->lengths[i].mem_size); byteswap32_var(&file->lengths[i].disk_size); /* Calculate offsets */ file->offsets[i].disk = disk_offset + 8; file->offsets[i].mem = mem_offset; disk_offset += file->lengths[i].disk_size; mem_offset += file->lengths[i].mem_size; } return MDE_Success; } #if 0 // FIXME behöver läsa in alla längder för att få fram filpositioner MDError get_bookshelf_lengths(FILE *input, const MDFile *file, size_t bookshelf_number, MDLengthsEntry *lengths) { if ((fseeko(input, file->lengths_start + bookshelf_number*sizeof(MDLengthsEntry), SEEK_SET) == -1) || (fread(lengths, sizeof(MDLengthsEntry), 1, input) != 1)) { return MDE_TruncatedFile; } /* Byte-swap */ for (size_t i = 0; i < sizeof(MDLengthsEntry)/4; i++) { byteswap32_var(((uint32_t*)lengths)+i); } return MDE_Success; } #endif MDError get_article(FILE *input, const MDFile *file, const char *name, char **article) { MDError error; uint8_t *block; size_t blocklen; size_t blocknum; MDLengthsEntry *lengths; MDOffsetsInfo *offsets; /* Locate index block */ check(get_block_number(file, name, &blocknum)); debug("blocknum = %d\n", blocknum); check(get_block(input, &file->index[blocknum], &block, &blocklen)); //fwrite(block, file->index[0].uncompressed_size, 1, stdout); /* Get article offset/length */ size_t length_entry_number; off_t article_offs; check(find_offset_in_block(file, block, blocklen, name, &article_offs)); check(get_length_entry_number(file, article_offs, &length_entry_number)); lengths = &file->lengths[length_entry_number]; offsets = &file->offsets[length_entry_number]; debug("whole_mem_offset = %qd, disk_offset[%d] = %qd (articles_start is %qd)\n", article_offs, length_entry_number, offsets->disk, file->articles_start); debug("lengths: mem %d disk: %d\n", lengths->mem_size, lengths->disk_size); /*printf("article_offs: %1$qx = %1$qd (articles_start is %2$qd)\n", article_offs + file->articles_start, file->articles_start); printf(" + header: %1$qx = %1$qd\n", article_offs + file->header_end); printf(" + index: %1$qx = %1$qd\n", article_offs + file->header_end + file->header.index_containersize); printf(" + blocks: %1$qx = %1$qd\n", article_offs + file->index[file->header.index_entries-1].offset + file->index[file->header.index_entries-1].compressed_size-8); printf(" relative: %1$qx = %1$qd\n", article_offs + file->index[blocknum].offset); printf(" blocks end: %1$qx = %1$qd\n", file->index[file->header.index_entries-1].offset + file->index[file->header.index_entries-1].compressed_size-8);*/ if (fseeko(input, offsets->disk, SEEK_SET) == -1) return MDE_TruncatedFile; //lengths->disk_size += 120000; //lengths->mem_size += 1200; uint8_t *compressed = malloc(lengths->disk_size); uint8_t *articles = malloc(lengths->mem_size); if (!compressed || !articles) { free(articles); free(compressed); return MDE_OutOfMemory; } debug("reading articles (%d bytes)\n", lengths->disk_size); if (fread(compressed, lengths->disk_size, 1, input) != 1) { free(compressed); return MDE_TruncatedFile; } //debug("disksize = %d memsize = %d\n", lengths->disk_size, lengths->mem_size); uLongf usize = lengths->mem_size; int zstatus = uncompress(articles, &usize, compressed, lengths->disk_size); free(compressed); if (zstatus != Z_OK) { fprintf(stderr, "zstatus = %d\n", zstatus); free(articles); return MDE_DecompressionFailed; } debug("lengths: mem %d\n", usize); /* Extract the right article */ //article_offs size_t internal_offset = article_offs - offsets->mem; const char *article_start = (char*)&articles[internal_offset]; debug("articles = %p + offs (%d) = %p\n", articles, internal_offset, &articles[internal_offset]); size_t article_length = strnlen(article_start, /*lengths->mem_size*/ usize - internal_offset); *article = malloc(article_length+1); memcpy(*article, article_start, article_length); (*article)[article_length] = '\0'; return MDE_Success; } MDError init_file(FILE *input, MDFile *file) { MDError error; check(read_signature(input)); check(skip_metadata(input)); check(read_header(input, &file->header)); file->header_end = ftello(input); debug(" meta_csum?: %.8x\n" " unkn2: %8d\n" "index entries: %8d\n" " unkn4: %8d\n" " num entries: %8d\n" " unkn6: %8d\n" " index_msize: %8d\n" " unkn8: %8d\n" " index_csize: %8d\n" " unkn10: %8d\n" " unkn11: %8d\n" " unkn12: %.8x\n", file->header.metadata_checksum, file->header.unknown2, file->header.index_entries, file->header.unknown4, file->header.num_entries, file->header.unknown6, file->header.index_memsize, file->header.unknown8, file->header.index_containersize, file->header.unknown10, file->header.unknown11, file->header.unknown12); debug(" header end: %8qd\n", file->header_end); check(read_index(input, file)); check(read_lengthtables(input, file)); return MDE_Success; } MDError list_words(FILE *input, const MDFile *file, const char *beginning) { MDError error; size_t first_block, last_block; const int wordlen = strlen(beginning); char last_word[wordlen+2]; check(get_block_number(file, beginning, &first_block)); memcpy(last_word, beginning, wordlen); last_word[wordlen] = '\xFF'; last_word[wordlen+1] = '\0'; check(get_block_number(file, last_word, &last_block)); if (last_block - first_block > 2) { printf("Too many words begin with those characters.\n"); return MDE_OutOfMemory; } size_t numNonMonotonic = 0; for (size_t blocknum = first_block; blocknum <= last_block; blocknum++) { uint8_t *block; size_t blocklen; check(get_block(input, &file->index[blocknum], &block, &blocklen)); size_t blockpos = 0; /* Read words from the block */ const char *previous_word = NULL; while (blockpos < blocklen) { const char *current_word = (const char*)(block + blockpos + 8); if (previous_word) { int derivative = mdstrcmp(current_word, previous_word); if (derivative < 0) { fprintf(stderr, "[warning] string value wasn't monotonic! " "derivative: %d\n", derivative); numNonMonotonic++; } } //printf("compare %s %s\n", name, (const char*)(block + blockpos + 8)); if (mdstrcmp(current_word, beginning) >= 0 && mdstrcmp(current_word, last_word) <= 0) { /* Match */ printf(" %s\n", current_word); } /*if (mdstrcmp((const char*)(block + blockpos + 8), beginning) > 0) { for (const char *c = (char*)(block + blockpos + 8); *c; c++) debug("%hhx[%c] ", *c, *c); debug("\n"); return MDE_EntryNotFound; }*/ previous_word = current_word; blockpos += 8 + strlen((const char*)(block + blockpos + 8)) + 1; } } if (numNonMonotonic) { fprintf(stderr, "[warning] num non-monotonic: %d\n", numNonMonotonic); } return MDE_Success; } #define HIST_SIZE 50 void show_articles(FILE *input, const MDFile *file) { size_t hist_id = 0; char word_hist[HIST_SIZE][1024]; memset(word_hist, 0, sizeof(word_hist)); while (true) { /* Ask for the article name */ #ifdef HAS_READLINE char *word = readline("word> "); if (!word) break; #else char word[1024]; printf("word> "); if (!fgets(word, sizeof(word), stdin)) { break; } if (word[0] == '\0') break; for (char *c = word; *c != '\0'; c++) { if (*c == '\n' || *c == '\r') { *c = '\0'; break; } else if (*c >= 'A' && *c <= 'Z') { *c |= 0x20; } } #endif if (word[0] == '\0') continue; if (word[0] == '!') { /* Search */ list_words(input, file, &word[1]); continue; } else if (word[0] == '/') { if (strcmp(word, "/hist") == 0) { /* Show history list */ for (size_t i = 0; i < HIST_SIZE; i++) { if (word_hist[i][0] != '\0') { printf("%2d) %.100s\n", i, word_hist[i]); } } continue; } else if (strncmp(word, "/hist ", 6) == 0) { /* Go to article in history list */ size_t i = atoi(&word[6]); if (i >= HIST_SIZE || word_hist[i][0] == '\0') { printf("no such entry"); continue; } memcpy(word, word_hist[i], 1024); } else { printf("invalid command\n"); continue; } } else { /* Store word in history */ memcpy(word_hist[hist_id++], word, 1024); if (hist_id >= HIST_SIZE) hist_id = 0; } /* Fetch article */ char *article; MDError error = get_article(input, file, word, &article); if (error != MDE_Success) { printf("%s\n", mde_string[error]); continue; } /* Display article */ //puts(article); view_html(article); #ifdef HAS_READLINE free(word); #endif } printf("\n"); } int main(int argc, char **argv) { int status = 0; for (int i = 1; i < argc; i++) { MDFile file; FILE *input = fopen(argv[i], "rb"); if (!input) { perror(argv[i]); status = 1; continue; } MDError error = init_file(input, &file); if (error != MDE_Success) { fprintf(stderr, "%s: %s\n", argv[i], mde_string[error]); status = 1; } show_articles(input, &file); fclose(input); } return status; }