switch to fixed-width storage for word list

2022-01-10 11:20:30 -08:00
parent 183e2fe7ac
commit e8d2f16c9e
3 changed files with 35 additions and 53 deletions
--- a/passphrase.nimble
+++ b/passphrase.nimble
@@ -33,6 +33,13 @@ task(dictionary, "Generate dictionary from BNC XML files"):
  echo output.strip().splitlines()[^1]
 task(pack, "Pack dictionary into fixed-width file and compress"):
  echo "Packing dictionary"
  echo runCmd("nim c --run src/dictionary.nim data/dictionary.txt data/dictionary.pack")
 before(build):
  if not fileExists("data/dictionary.txt"):
    dictionaryTask()
  if not fileExists("data/dictionary.pack"):
    packTask()
--- a/src/dictionary.nim
+++ b/src/dictionary.nim
@@ -5,82 +5,58 @@ import zippy
 type
  Dictionary* = object
    words: string
-    offsets: seq[uint32]
+    width: uint32
 proc `[]`*(d: Dictionary, i: Natural): string =
-  # last word has no following start index, so we have to fake it
+  let start = i.uint32 * d.width
-  # also strings are indexed with ints
+  d.words[start ..< start + d.width]
  let slice = 
    if i == d.offsets.high:
      d.offsets[i].int .. d.words.high
    else:
      d.offsets[i].int ..< d.offsets[i + 1].int
  result = d.words[slice]
 proc len*(d: Dictionary): int =
-  result = d.offsets.len
+  result = d.words.len div d.width.int
-proc addU32(s: var seq[uint8], i: uint32) =
+proc addU32(s: var string, i: uint32) =
  for offset in 0..3:
-    let b = cast[uint8](i shr (offset * 8))
+    let b = cast[char](i shr (offset * 8))
    s.add(b)
-proc getU32(s: seq[uint8]): uint32 =
+proc getU32(s: string): uint32 =
  for offset in 0..3:
    result = result or (cast[uint32](s[offset]) shl (offset * 8))
-proc pack*(d: Dictionary): seq[uint8] =
+proc pack*(d: Dictionary): string =
-  let compressed = cast[seq[uint8]](d.words)
+  var data: string
-  let lenWords = compressed.len
+  data.addU32(d.width)
-  let lenOffsets = d.offsets.len * 4
+  data.add(d.words)
-  # 8 extra bytes for the length specifiers
+  data.compress(dataFormat = dfGzip)
  var buff = newSeqOfCap[uint8](lenWords + lenOffsets + 8)
  buff.addU32(lenWords.uint32)
  buff.add(compressed)
  buff.addU32(lenOffsets.uint32)
  for offset in d.offsets:
    buff.addU32(offset)
  result = buff.compress(dataFormat = dfGzip)
-proc unpack*[T: seq[uint8]|string](p: T): Dictionary =
+proc unpack*(p: string): Dictionary =
-  when T is string:
+  let data = p.uncompress(dataFormat = dfGzip)
-    let buff = cast[seq[uint8]](p.uncompress(dataFormat = dfGzip))
+  result.width = data.getU32()
-  else:
+  result.words = data[4..^1]
    let buff = p.uncompress(dataFormat = dfGzip)
  let lenWords = buff.getU32()
  let nextSection = lenWords + 4
  let words = cast[string](buff[4 ..< nextSection])
  let numOffsets = buff[nextSection ..< (nextSection + 4)].getU32() div 4
  var offsets = newSeqOfCap[uint32](numOffsets)
  for i in 0 ..< numOffsets:
    let idx = nextSection + 4 + (i * 4)
    let offset = buff[idx ..< idx + 4].getU32()
    offsets.add(offset)
  result = Dictionary(words: words, offsets: offsets)
-proc loadWords*(): Dictionary =
+proc loadWords*(path: string): Dictionary =
-  var i: uint32
+  result.width = 25
-  for word in readFile("../data/dictionary.txt").strip().splitLines():
+  for word in readFile(path).strip().splitLines():
    if word.len > 25:
      continue
    result.words.add(word)
-    result.offsets.add(i)
+    for _ in 0..<(25 - word.len):
-    i += word.len.uint32
+      result.words.add(' ')
 when isMainModule:
  import std/os
  echo "Loading words..."
-  let dictionary = loadWords()
+  let dictionary = loadWords(paramStr(1))
  echo "Packing dictionary..."
  let packed = dictionary.pack()
-  discard open("../data/dictionary.pack", fmWrite).writeBytes(packed, 0, packed.len)
+  writeFile(paramStr(2), packed)
  echo "Dictionary packed."
--- a/src/passphrase.nim
+++ b/src/passphrase.nim
@@ -13,7 +13,7 @@ proc genPassphrase(dict: Dictionary, length, dictSize: int): string =
  var words: seq[string]
  for r in rands:
    let i = r mod dictSize.uint64
-    words.add(dict[i])
+    words.add(dict[i].strip())
  result = words.join(" ")
@@ -52,6 +52,5 @@ proc parseInput(dictLen: int): (int, int) =
 when isMainModule:
  let dict = packed.unpack()
  echo dict.len
  let (length, dictSize) = parseInput(dict.len)
  echo genPassphrase(dict, length, dictSize)