initial commit

2021-07-26 06:48:01 -07:00
commit b388552a27
9 changed files with 128411 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,2 @@
 *.exe
 BNC/*
--- a/counts.txt
+++ b/counts.txt
--- a/dictionary.txt
+++ b/dictionary.txt
--- a/passphrase.nimble
+++ b/passphrase.nimble
@@ -0,0 +1,50 @@
 # Package
 version       = "0.1.0"
 author        = "Joseph Montanaro"
 description   = "Passphrase generator and dictionary builder"
 license       = "none"
 srcDir        = "src"
 bin           = @["passphrase"]
 # Dependencies
 requires "nim >= 1.0"
 requires "nimcrypto >= 0.4.8"
 # Tasks and scripts
 import strutils
 proc runCmd(command: string, input = "", cache = ""): string =
  ## Runs `command` through `gorgeEx` and returns its captured output.
  ## If the command exits with a non-zero code, prints the command and its
  ## output (indented by two spaces) and quits with that same exit code.
  let (captured, status) = gorgeEx(command, input, cache)
  if status == 0:
    return captured
  echo "Command failed: " & command
  echo "Output:\n  ", captured.splitLines().join("\n  ")
  quit(status)
 proc pathExists(filename: string): bool =
  ## Reports whether `filename` names an existing file or directory.
  ## Uses the NimScript built-ins instead of shelling out to `ls`, so it does
  ## not depend on `ls` being installed and never passes `filename` through
  ## a shell.
  result = fileExists(filename) or dirExists(filename)
 # Builds the dictionary by compiling and running src/process.nim in release
 # mode with threads enabled. Quits with exit code 1 when the BNC path is
 # not present.
 task(dictionary, "Generate dictionary from BNC XML files"):
  if not pathExists("BNC"):
    quit("Cannot build dictionary: BNC data files not found.", 1)
  echo "Building dictionary"
  echo runCmd("nim c --run --threads:on -d:release -d:lto src/process.nim")
 # Debugging aid: prints the output and the exit code of running `ls BNC`.
 task(showexec, "this is a test"):
  let (output, exitCode) = gorgeEx("ls BNC")
  echo output
  echo exitCode
 # Before building, run the dictionary task if src/dictionary.txt is missing.
 before(build):
  if not pathExists("src/dictionary.txt"):
    dictionaryTask()
--- a/src/counts.txt
+++ b/src/counts.txt
--- a/src/dictionary.txt
+++ b/src/dictionary.txt
--- a/src/passphrase.nim
+++ b/src/passphrase.nim
@@ -0,0 +1,83 @@
 import std/[os, strutils]
 import nimcrypto/sysrand
 # Word list stored as one concatenated string plus the start index of each
 # word within it.
 type Dictionary = object
  words: string          # every word appended back to back, no separators
  offsets: seq[uint32]   # offsets[i] is the index in `words` where word i starts
 proc `[]`(d: Dictionary, i: Natural): string =
  ## Returns word number `i`. A word ends right before the next word's start
  ## offset; the final word has no successor, so it runs to the end of
  ## `words`. Offsets are stored as uint32 and converted to int for indexing.
  let first = d.offsets[i].int
  let last =
    if i < d.offsets.high: d.offsets[i + 1].int - 1
    else: d.words.high
  result = d.words[first .. last]
 proc len(d: Dictionary): int =
  ## Number of words in the dictionary (one offset is stored per word).
  d.offsets.len
 proc loadWords(): Dictionary =
  ## Builds a Dictionary from the lines of dictionary.txt, which is read
  ## with `staticRead`. Each line's start offset is recorded before the
  ## line is appended to the packed `words` string.
  let entries = staticRead("dictionary.txt").strip().splitLines()
  for entry in entries:
    result.offsets.add(result.words.len.uint32)
    result.words.add(entry)
 # The dictionary, evaluated at compile time from dictionary.txt by loadWords.
 const dict = loadWords()
 proc genPassphrase(length, dictSize: int): string =
  ## Returns `length` words joined by single spaces. Each word is picked with
  ## system randomness from the first `dictSize` entries of `dict`.
  ##
  ## Quits with exit code 1 when `length` is below 1, when `dictSize` is
  ## outside 100 .. dict.len, or when the system random source does not
  ## deliver the requested data.
  if length < 1:
    quit("Passphrase length must be at least 1", 1)
  if dictSize < 100 or dictSize > dict.len:
    quit("Dictionary size must be between 100 and " & $dict.len, 1)
  var rands = newSeq[uint64](length)
  # Never continue with the zero-filled buffer: it would select the first
  # dictionary word every time. A negative or short count is a failure.
  if randomBytes(rands) < length:
    quit("Could not read random data from the system.", 1)
  var words = newSeqOfCap[string](length)
  for r in rands:
    # The remainder is below dictSize (an int), so the conversion is safe.
    words.add(dict[int(r mod dictSize.uint64)])
  result = words.join(" ")
 # Usage text echoed by parseInput when -h or --help is given.
 const help = """Usage:
  passphrase [LENGTH] [DICTSIZE]
 Defaults to length of 4 and dictionary size of 25,000."""
 proc parseInput(): (int, int) =
  ## Reads the command line and returns `(length, dictSize)`.
  ##
  ## The first argument, if present, is the passphrase length (default 4);
  ## the second, if present, is the dictionary size (default 25_000).
  ## `-h` or `--help` anywhere in the arguments prints the usage text and
  ## quits with code 0. An argument that is not an integer, a length below
  ## 1, or a dictionary size outside 100 .. dict.len quits with code 1.
  let params = commandLineParams()
  if "-h" in params or "--help" in params:
    echo help
    quit(0)
  var
    length = 4
    dictSize = 25_000
  if params.len > 0:
    try:
      length = parseInt(params[0])
    except ValueError:
      quit(params[0] & " is not a valid passphrase length.", 1)
    # parseInt accepts zero and negative numbers, which are not usable as a
    # word count.
    if length < 1:
      quit(params[0] & " is not a valid passphrase length.", 1)
  if params.len > 1:
    try:
      dictSize = parseInt(params[1])
    except ValueError:
      quit(params[1] & " is not a valid dictionary size.", 1)
    if dictSize < 100 or dictSize > dict.len:
      quit("Dictionary size must be between 100 and " & $dict.len, 1)
  result = (length, dictSize)
 # Entry point: parse the command line, then print one passphrase.
 when isMainModule:
  let (length, dictSize) = parseInput()
  echo genPassphrase(length, dictSize)
--- a/src/process.nim
+++ b/src/process.nim
@@ -0,0 +1,72 @@
 import std/[os, cpuinfo, times, monotimes]
 import std/[streams, strutils, tables]
 import scanxml
 proc save(wordCounts: CountTable; dictName, countName: string; minCount = 3) =
  ## Writes the entries of `wordCounts` to two files: `dictName` receives one
  ## word per line, `countName` receives "<count> <word>" per line.
  ##
  ## Writing stops at the first entry whose count is below `minCount`, so the
  ## table is expected to be sorted by descending count already (the caller
  ## in this file sorts it first). Both streams are closed on exit, including
  ## when an exception is raised.
  let dictFile = openFileStream(dictName, fmWrite)
  defer: dictFile.close()
  let countFile = openFileStream(countName, fmWrite)
  defer: countFile.close()
  for word, count in wordCounts:
    if count < minCount:
      break
    dictFile.writeLine(word)
    countFile.writeLine($count & " " & word)
 # Channels the worker threads use to report to the main thread:
 # `threadResults` carries the CountTable produced by each processFiles call,
 # `progress` carries a 1 for every file a worker has finished.
 var 
  threadResults: Channel[CountTable[string]]
  progress: Channel[int]
 threadResults.open()
 progress.open()
 proc processFiles(filenames: seq[string]) =
  ## Worker body: tallies every word that `iterWords` yields for each file in
  ## `filenames`. Sends 1 on `progress` after each file, and the finished
  ## tally on `threadResults` once all files are done.
  var tally: CountTable[string]
  for path in filenames:
    for w in iterWords(path):
      inc(tally, w)
    send(progress, 1)
  send(threadResults, tally)
 when isMainModule:
  # Counts words in every .xml file below basePath with one worker thread per
  # processor, merges the per-thread tallies, sorts them and writes them out
  # through `save`.
  let start = getMonoTime()
  # NOTE(review): this path and the output file names passed to `save` are
  # relative to the current working directory — confirm where this is meant
  # to be run from.
  let basePath = r"../BNC/2554/download/Texts/"
  var paths: seq[string]
  for path in walkDirRec(basePath):
    if path.endsWith(".xml"):
      paths.add(path)
  # Refuse to overwrite the output files with empty ones.
  if paths.len == 0:
    quit("No .xml files found under " & basePath, 1)
  # countProcessors() returns 0 when the count cannot be detected; use at
  # least one worker so the division below is always defined.
  let numThreads = max(1, countProcessors())
  var threads = newSeq[Thread[seq[string]]](numThreads)
  let baseChunk = paths.len div numThreads
  let remainder = paths.len mod numThreads
  var lastIdx = 0
  for i, t in threads.mpairs:
    # The first `remainder` threads take one extra file each.
    let chunksize = if i < remainder: baseChunk + 1 else: baseChunk
    let newIdx = lastIdx + chunksize
    let chunk = paths[lastIdx ..< newIdx]
    lastIdx = newIdx
    createThread(t, processFiles, chunk)
  var processed = 0
  for i in 0 .. paths.high:
    processed += progress.recv()
    stdout.write("Processing files: " & $processed & "\r")
    stdout.flushFile
  echo ""
  var counts: CountTable[string]
  for i in 0 .. threads.high:
    let subCounts = threadResults.recv()
    for word, count in subCounts:
      counts.inc(word, count)
  # Every worker has sent its result by now; wait for the threads to finish
  # before releasing the channels.
  joinThreads(threads)
  threadResults.close()
  progress.close()
  counts.sort()
  save(counts, "dictionary.txt", "counts.txt")
  echo "Done. Finished in ", (getMonoTime() - start).inMilliseconds.float / 1000, " seconds."
--- a/src/scanxml.nim
+++ b/src/scanxml.nim
@@ -0,0 +1,74 @@
 import std/[streams, options, parsexml]
 # Values of the `c5` attribute for which a word is kept; iterWords skips any
 # <w> element whose code is not listed here.
 const validCodes = [
  "AJ0", "AJC", "AJS", "AVQ", "CJS", "DT0", "DTQ", "NN0", "NN1", "NN2", 
  "PNI", "PNQ", "PNX", "VVB", "VVD", "VVG", "VVI", "VVN", "VVZ"
 ]
 # The two attributes read from a <w> element.
 type Word = object
  code: string      # value of the `c5` attribute
  headword: string  # value of the `hw` attribute
 proc isValid(headword: string): bool =
  ## True when `headword` consists only of 7-bit ASCII characters and
  ## contains no digit and none of the characters $ - ' / , . % ( ) [ ] { }.
  ## An empty string is considered valid.
  const rejected = {'0'..'9', '$', '-', '\'', '/', ',', '.', '%',
    '(', ')', '[', ']', '{', '}', '\x80'..'\xFF'}
  for ch in headword:
    if ch in rejected:
      return false
  return true
 proc getWordAttrs(x: var XmlParser): Option[Word] =
  ## Advances `x` through the events that follow an element-open event and
  ## collects the `c5` and `hw` attributes. Returns `some(word)` as soon as
  ## both are non-empty, or `none` if the attribute list ends first.
  var w: Word
  x.next()
  # Stop at xmlEof as well: at end of input neither of the other two kinds
  # ever appears, so a truncated tag would otherwise never leave this loop.
  while x.kind notin {xmlElementClose, xmlElementOpen, xmlEof}:
    if x.kind == xmlAttribute:
      if x.attrKey == "c5":
        w.code = x.attrValue
      elif x.attrKey == "hw":
        w.headword = x.attrValue
      if w.code.len > 0 and w.headword.len > 0:
        return some(w)
    x.next()
  return none(Word)
 iterator iterWords*(filename: string): string =
  ## Yields the `hw` attribute of every <w> element in the XML file
  ## `filename` whose `c5` code is in `validCodes` and whose headword passes
  ## `isValid`.
  ##
  ## If the file cannot be opened, the failure is echoed and nothing is
  ## yielded.
  var file: FileStream
  try:
    file = openFileStream(filename)
  except IOError as e:
    echo "Failed to open file: ", filename
    echo "Message: ", e.msg
  # Only parse when the open succeeded; otherwise `file` is still nil and
  # must not be handed to the parser.
  if not file.isNil:
    var x: XmlParser
    x.open(file, filename)
    x.next()
    while x.kind != xmlEof:
      x.next()
      if x.kind == xmlElementOpen and x.elementName == "w":
        let res = x.getWordAttrs()
        if res.isSome:
          let word = res.get()
          if word.code in validCodes and isValid(word.headword):
            yield word.headword
    # Closing the parser also closes the stream it was opened on.
    x.close()
 when isMainModule:
  # Manual check: count the words iterWords yields for one sample file.
  const samplePath = r"C:\Users\Joe\Documents\Code\words\BNC\2554\download\Texts\A\A0\A00.xml"
  var total = 0
  for _ in iterWords(samplePath):
    total += 1
  echo "words found: ", total