store dictionary compressed

2022-01-10 10:31:05 -08:00
parent 8657f3c13d
commit 183e2fe7ac
4 changed files with 101 additions and 35 deletions
@@ -1,2 +1,4 @@
 *.exe
 data/BNC/*
+data/dictionary*
+!data/dictionary.txt
@@ -12,6 +12,7 @@ bin           = @["passphrase"]

 requires "nim >= 1.0"
 requires "nimcrypto >= 0.4.8"
+requires "zippy >= 0.7.3"


 # Tasks n scripts
@@ -0,0 +1,86 @@
+import std/strutils
+import zippy
+
+
+type
+  Dictionary* = object  ## A word list packed into a single string.
+    words: string  ## every word concatenated, with no separators
+    offsets: seq[uint32]  ## offsets[i] = index in `words` where word i starts
+
+
+proc `[]`*(d: Dictionary, i: Natural): string =
+  ## Returns the `i`-th word of the dictionary.
+  ##
+  ## A word runs from its own start offset up to (but excluding) the start
+  ## offset of the following word. The final word has no successor, so it
+  ## runs to the end of `words` instead.
+  let first = d.offsets[i].int  # strings are indexed with ints
+  if i == d.offsets.high:
+    result = d.words[first .. d.words.high]
+  else:
+    result = d.words[first ..< d.offsets[i + 1].int]
+
+
+proc len*(d: Dictionary): int =
+  ## Number of words in the dictionary (one offset is stored per word).
+  d.offsets.len
+
+
+proc addU32(s: var seq[uint8], i: uint32) =
+  ## Appends `i` to `s` as four bytes, least significant byte first.
+  var remaining = i
+  for _ in 0 ..< 4:
+    # emit the current low byte, then shift the next one into place
+    s.add(uint8(remaining and 0xFF'u32))
+    remaining = remaining shr 8
+
+
+proc getU32(s: openArray[uint8]): uint32 =
+  ## Decodes the first four bytes of `s` as a `uint32`, least significant
+  ## byte first. Any bytes after the fourth are ignored.
+  ##
+  ## Takes an `openArray` so seqs, arrays and `toOpenArray` views can all
+  ## be passed without the caller having to allocate a new seq.
+  for offset in 0..3:
+    # a plain widening conversion is enough here; no `cast` needed
+    result = result or (uint32(s[offset]) shl (offset * 8))
+
+
+proc pack*(d: Dictionary): seq[uint8] =
+  ## Serializes `d` into one buffer and returns it gzip-compressed.
+  ##
+  ## Buffer layout before compression: a 4-byte length of the word data,
+  ## the word data, a 4-byte length of the offset table in bytes, then one
+  ## 4-byte value per offset.
+  let wordBytes = cast[seq[uint8]](d.words)  # raw, not yet compressed
+  let offsetBytes = d.offsets.len * 4
+  # both sections plus the two 4-byte length prefixes
+  var payload = newSeqOfCap[uint8](wordBytes.len + offsetBytes + 8)
+  payload.addU32(uint32(wordBytes.len))
+  payload.add(wordBytes)
+  payload.addU32(uint32(offsetBytes))
+  for start in d.offsets:
+    payload.addU32(start)
+  payload.compress(dataFormat = dfGzip)
+
+
+proc unpack*[T: seq[uint8]|string](p: T): Dictionary =
+  ## Rebuilds a `Dictionary` from gzip-compressed data.
+  ##
+  ## Expected layout after decompression: a 4-byte length of the word
+  ## data, the word data, a 4-byte length of the offset table in bytes,
+  ## then one 4-byte value per offset.
+  ##
+  ## Raises `ValueError` if the decompressed data is shorter than its own
+  ## length fields claim, instead of relying on index checks alone.
+  when T is string:
+    let buff = cast[seq[uint8]](p.uncompress(dataFormat = dfGzip))
+  else:
+    let buff = p.uncompress(dataFormat = dfGzip)
+
+  template requireLen(needed: int) =
+    if buff.len < needed:
+      raise newException(ValueError, "packed dictionary is truncated")
+
+  # Sizes are widened to int right away: seqs and strings are indexed
+  # with ints, and arithmetic done in uint32 could wrap around.
+  requireLen(4)
+  let lenWords = buff.getU32().int
+  let nextSection = lenWords + 4
+  requireLen(nextSection + 4)
+
+  # Copy the bytes into a real string rather than casting the seq; a cast
+  # only reinterprets the seq's memory as a string.
+  var words = newString(lenWords)
+  for i in 0 ..< lenWords:
+    words[i] = char(buff[4 + i])
+
+  # The stored length is in bytes; every offset occupies four of them.
+  let numOffsets = buff[nextSection ..< nextSection + 4].getU32().int div 4
+  let offsetsStart = nextSection + 4
+  requireLen(offsetsStart + numOffsets * 4)
+  var offsets = newSeqOfCap[uint32](numOffsets)
+  for i in 0 ..< numOffsets:
+    let idx = offsetsStart + i * 4
+    offsets.add(buff[idx ..< idx + 4].getU32())
+
+  result = Dictionary(words: words, offsets: offsets)
+
+
+proc loadWords*(): Dictionary =
+  ## Builds a `Dictionary` from the text file "../data/dictionary.txt",
+  ## treating each line of the stripped file contents as one word.
+  let text = readFile("../data/dictionary.txt").strip()
+  var nextStart = 0'u32
+  for line in text.splitLines():
+    # record where this word begins, then append it to the packed string
+    result.offsets.add(nextStart)
+    result.words.add(line)
+    nextStart += uint32(line.len)
+
+
+when isMainModule:
+  # Run as a script: build the dictionary from the text file and write the
+  # packed form to "../data/dictionary.pack".
+  echo "Loading words..."
+  let dictionary = loadWords()
+  echo "Packing dictionary..."
+  let packed = dictionary.pack()
+  let outFile = open("../data/dictionary.pack", fmWrite)
+  try:
+    # writeBytes returns the number of bytes actually written; a short
+    # write would leave a corrupt pack file, so treat it as an error.
+    if outFile.writeBytes(packed, 0, packed.len) != packed.len:
+      raise newException(IOError, "could not write the whole packed dictionary")
+  finally:
+    # always release the handle, even when the write fails
+    outFile.close()
+  echo "Dictionary packed."
@@ -1,38 +1,12 @@
 import std/[os, strutils]
 import nimcrypto/sysrand
+import dictionary


-type Dictionary = object
-  words: string
-  offsets: seq[uint32]
+const packed = staticRead("../data/dictionary.pack")


-proc `[]`(d: Dictionary, i: Natural): string =
-  # last word has no following start index, so we have to fake it
-  # also strings are indexed with ints
-  let slice = if i == d.offsets.high:
-    d.offsets[i].int .. d.words.high
-  else:
-    d.offsets[i].int ..< d.offsets[i + 1].int
-
-  result = d.words[slice]
-
-
-proc len(d: Dictionary): int =
-  result = d.offsets.len
-
-
-proc loadWords(): Dictionary =
-  for word in staticRead("../data/dictionary.txt").strip().splitLines():
-    let startIdx = result.words.len.uint32
-    result.offsets.add(startIdx)
-    result.words.add(word)
-
-
-const dict = loadWords()
-
-
-proc genPassphrase(length, dictSize: int): string =
+proc genPassphrase(dict: Dictionary, length, dictSize: int): string =
  var rands = newSeq[uint64](length)
  discard randomBytes(rands)

@@ -48,7 +22,8 @@ const help = """Usage:

 Defaults to length of 4 and dictionary size of 25,000."""

-proc parseInput(): (int, int) =
+
+proc parseInput(dictLen: int): (int, int) =
  let params = commandLineParams()
  if "-h" in params or "--help" in params:
    echo help
@@ -67,8 +42,8 @@ proc parseInput(): (int, int) =
  if params.len > 1:
    try:
      dictSize = parseInt(params[1])
-      if dictSize < 100 or dictSize > dict.len:
-        quit("Dictionary size must be between 100 and " & $dict.len, 1)
+      if dictSize < 100 or dictSize > dictLen:
+        quit("Dictionary size must be between 100 and " & $dictLen, 1)
    except ValueError:
      quit('"' & params[1] & "\" is not a valid dictionary size.", 1)

@@ -76,5 +51,7 @@ proc parseInput(): (int, int) =


 when isMainModule:
-  let (length, dictSize) = parseInput()
-  echo genPassphrase(length, dictSize)
+  # Decompress the dictionary that was embedded at compile time; only the
+  # generated passphrase is printed.
+  let dict = packed.unpack()
+  let (length, dictSize) = parseInput(dict.len)
+  echo genPassphrase(dict, length, dictSize)