From e8d2f16c9eae3cca3926b212107c2eaf7abf6106 Mon Sep 17 00:00:00 2001 From: Joseph Montanaro Date: Mon, 10 Jan 2022 11:20:30 -0800 Subject: [PATCH] switch to fixed-width storage for word list --- passphrase.nimble | 7 +++++ src/dictionary.nim | 78 ++++++++++++++++------------------------------ src/passphrase.nim | 3 +- 3 files changed, 35 insertions(+), 53 deletions(-) diff --git a/passphrase.nimble b/passphrase.nimble index dd81c74..93cb60a 100644 --- a/passphrase.nimble +++ b/passphrase.nimble @@ -33,6 +33,13 @@ task(dictionary, "Generate dictionary from BNC XML files"): echo output.strip().splitlines()[^1] +task(pack, "Pack dictionary into fixed-width file and compress"): + echo "Packing dictionary" + echo runCmd("nim c --run src/dictionary.nim data/dictionary.txt data/dictionary.pack") + + before(build): if not fileExists("data/dictionary.txt"): dictionaryTask() + if not fileExists("data/dictionary.pack"): + packTask() diff --git a/src/dictionary.nim b/src/dictionary.nim index 57db2fb..1cd8edb 100644 --- a/src/dictionary.nim +++ b/src/dictionary.nim @@ -5,82 +5,58 @@ import zippy type Dictionary* = object words: string - offsets: seq[uint32] + width: uint32 proc `[]`*(d: Dictionary, i: Natural): string = - # last word has no following start index, so we have to fake it - # also strings are indexed with ints - let slice = - if i == d.offsets.high: - d.offsets[i].int .. d.words.high - else: - d.offsets[i].int ..< d.offsets[i + 1].int - - result = d.words[slice] + let start = i.uint32 * d.width + d.words[start ..< start + d.width] proc len*(d: Dictionary): int = - result = d.offsets.len + result = d.words.len div d.width.int -proc addU32(s: var seq[uint8], i: uint32) = +proc addU32(s: var string, i: uint32) = for offset in 0..3: - let b = cast[uint8](i shr (offset * 8)) + let b = cast[char](i shr (offset * 8)) s.add(b) -proc getU32(s: seq[uint8]): uint32 = +proc getU32(s: string): uint32 = for offset in 0..3: result = result or (cast[uint32](s[offset]) shl (offset * 8)) -proc pack*(d: Dictionary): seq[uint8] = - let compressed = cast[seq[uint8]](d.words) - let lenWords = compressed.len - let lenOffsets = d.offsets.len * 4 - # 8 extra bytes for the length specifiers - var buff = newSeqOfCap[uint8](lenWords + lenOffsets + 8) - buff.addU32(lenWords.uint32) - buff.add(compressed) - buff.addU32(lenOffsets.uint32) - for offset in d.offsets: - buff.addU32(offset) - result = buff.compress(dataFormat = dfGzip) +proc pack*(d: Dictionary): string = + var data: string + data.addU32(d.width) + data.add(d.words) + data.compress(dataFormat = dfGzip) -proc unpack*[T: seq[uint8]|string](p: T): Dictionary = - when T is string: - let buff = cast[seq[uint8]](p.uncompress(dataFormat = dfGzip)) - else: - let buff = p.uncompress(dataFormat = dfGzip) - - let lenWords = buff.getU32() - let nextSection = lenWords + 4 - let words = cast[string](buff[4 ..< nextSection]) - - let numOffsets = buff[nextSection ..< (nextSection + 4)].getU32() div 4 - var offsets = newSeqOfCap[uint32](numOffsets) - for i in 0 ..< numOffsets: - let idx = nextSection + 4 + (i * 4) - let offset = buff[idx ..< idx + 4].getU32() - offsets.add(offset) - - result = Dictionary(words: words, offsets: offsets) +proc unpack*(p: string): Dictionary = + let data = p.uncompress(dataFormat = dfGzip) + result.width = data.getU32() + result.words = data[4..^1] -proc loadWords*(): Dictionary = - var i: uint32 - for word in readFile("../data/dictionary.txt").strip().splitLines(): +proc loadWords*(path: string): Dictionary = + result.width = 25 + for word in readFile(path).strip().splitLines(): + if word.len > 25: + continue + result.words.add(word) - result.offsets.add(i) - i += word.len.uint32 + for _ in 0..<(25 - word.len): + result.words.add(' ') when isMainModule: + import std/os echo "Loading words..." - let dictionary = loadWords() + let dictionary = loadWords(paramStr(1)) echo "Packing dictionary..." let packed = dictionary.pack() - discard open("../data/dictionary.pack", fmWrite).writeBytes(packed, 0, packed.len) + writeFile(paramStr(2), packed) echo "Dictionary packed." diff --git a/src/passphrase.nim b/src/passphrase.nim index d784ba8..a6f7daf 100644 --- a/src/passphrase.nim +++ b/src/passphrase.nim @@ -13,7 +13,7 @@ proc genPassphrase(dict: Dictionary, length, dictSize: int): string = var words: seq[string] for r in rands: let i = r mod dictSize.uint64 - words.add(dict[i]) + words.add(dict[i].strip()) result = words.join(" ") @@ -52,6 +52,5 @@ proc parseInput(dictLen: int): (int, int) = when isMainModule: let dict = packed.unpack() - echo dict.len let (length, dictSize) = parseInput(dict.len) echo genPassphrase(dict, length, dictSize)