initial commit

This commit is contained in:
Joseph Montanaro 2021-07-26 06:48:01 -07:00
commit b388552a27
9 changed files with 128411 additions and 0 deletions

2
.gitignore vendored Normal file
View File

@ -0,0 +1,2 @@
*.exe
BNC/*

0
counts.txt Normal file
View File

0
dictionary.txt Normal file
View File

50
passphrase.nimble Normal file
View File

@ -0,0 +1,50 @@
# Package
# Nimble package metadata for the passphrase generator.
version = "0.1.0"
author = "Joseph Montanaro"
description = "Passphrase generator and dictionary builder"
license = "none"
# Sources live under src/; the single binary is built from src/passphrase.nim.
srcDir = "src"
bin = @["passphrase"]
# Dependencies
requires "nim >= 1.0"
# nimcrypto supplies the system random source imported by src/passphrase.nim.
requires "nimcrypto >= 0.4.8"
# Tasks n scripts
import strutils
proc runCmd(command: string, input = "", cache = ""): string =
  ## Runs `command` through `gorgeEx` and returns its captured output.
  ##
  ## `input` and `cache` are forwarded to `gorgeEx` unchanged. When the
  ## command exits with a non-zero status, the command line and its output
  ## are echoed and the script quits with that same status.
  let res = gorgeEx(command, input, cache)
  if res.exitCode == 0:
    return res.output
  echo "Command failed: " & command
  echo "Output:\n ", res.output.splitLines().join("\n ")
  quit(res.exitCode)
proc pathExists(filename: string): bool =
  ## Returns true when `filename` names an existing file or directory.
  ##
  ## Uses the built-in `fileExists`/`dirExists` checks (available to
  ## NimScript without an import) instead of shelling out to `ls`, so the
  ## check also works where `ls` is not installed (e.g. plain Windows) and
  ## the path is never interpolated into a shell command line.
  result = fileExists(filename) or dirExists(filename)
task dictionary, "Generate dictionary from BNC XML files":
  # The word list is built from the BNC directory; without it there is
  # nothing to process, so bail out with a failure status.
  let haveCorpus = pathExists("BNC")
  if not haveCorpus:
    quit("Cannot build dictionary: BNC data files not found.", 1)
  echo "Building dictionary"
  # Compile and run the processing program, relaying its output.
  let buildLog = runCmd(
    "nim c --run --threads:on -d:release -d:lto src/process.nim")
  echo buildLog
task showexec, "this is a test":
  # Scratch task: prints the output and exit code of `ls BNC`.
  let res = gorgeEx("ls BNC")
  echo res.output
  echo res.exitCode
before build:
  # Generate the dictionary first when src/dictionary.txt is not present.
  let haveDictionary = pathExists("src/dictionary.txt")
  if not haveDictionary:
    dictionaryTask()

64065
src/counts.txt Normal file

File diff suppressed because it is too large Load Diff

64065
src/dictionary.txt Normal file

File diff suppressed because it is too large Load Diff

83
src/passphrase.nim Normal file
View File

@ -0,0 +1,83 @@
import std/[os, strutils]
import nimcrypto/sysrand
type Dictionary = object
  ## Word list stored as one concatenated string plus the start offset of
  ## each word within it.
  words: string         # every word appended back to back, no separators
  offsets: seq[uint32]  # offsets[i] is the index in `words` where word i starts
proc `[]`(d: Dictionary, i: Natural): string =
  ## Returns the `i`-th word of the dictionary.
  ##
  ## A word runs from its own start offset up to (but excluding) the start
  ## offset of the next word. The final word has no successor, so it runs to
  ## the end of the backing string instead.
  # Offsets are stored as uint32, but strings are indexed with ints.
  let first = d.offsets[i].int
  let last =
    if i == d.offsets.high:
      d.words.high
    else:
      d.offsets[i + 1].int - 1
  result = d.words[first .. last]
proc len(d: Dictionary): int =
  ## Number of words in the dictionary (one start offset is kept per word).
  d.offsets.len
proc loadWords(): Dictionary =
  ## Builds a `Dictionary` from `dictionary.txt`, read with `staticRead`.
  ##
  ## Surrounding whitespace is stripped from the file contents, then each
  ## line is appended to the packed word string and its start offset is
  ## recorded.
  let contents = staticRead("dictionary.txt")
  for entry in splitLines(strip(contents)):
    # The word starts where the packed string currently ends.
    result.offsets.add(uint32(result.words.len))
    result.words.add(entry)
# The word list, built once at compile time from dictionary.txt.
const dict = loadWords()
proc genPassphrase(length, dictSize: int): string =
  ## Returns `length` randomly chosen words joined by single spaces.
  ##
  ## Words are drawn from the first `dictSize` entries of `dict`. Quits with
  ## status 1 if `dictSize` is outside `100 .. dict.len`, or if the system
  ## random source cannot fill the whole request.
  if dictSize < 100 or dictSize > dict.len:
    quit("Dictionary size must be between 100 and " & $dict.len, 1)
  var rands = newSeq[uint64](length)
  # Never ignore the RNG result: on failure the buffer stays zeroed and the
  # "passphrase" would be the first dictionary word repeated.
  if randomBytes(rands) < length:
    quit("Failed to read random data from the system.", 1)
  var words = newSeqOfCap[string](length)
  for r in rands:
    # r mod dictSize is below dictSize (<= dict.len), so it always fits in an
    # int. Modulo bias is negligible with 64-bit samples and a list this small.
    let i = int(r mod dictSize.uint64)
    words.add(dict[i])
  result = words.join(" ")
# Usage text echoed by parseInput when -h or --help is passed.
const help = """Usage:
passphrase [LENGTH] [DICTSIZE]
Defaults to length of 4 and dictionary size of 25,000."""
proc parseInput(): (int, int) =
  ## Parses the command line into `(length, dictSize)`.
  ##
  ## Both positional arguments are optional: LENGTH defaults to 4 and
  ## DICTSIZE to 25,000. `-h`/`--help` anywhere in the arguments prints the
  ## usage text and exits with status 0. Any argument that is not an integer,
  ## or is out of range, ends the program with an error message and status 1.
  let params = commandLineParams()
  if "-h" in params or "--help" in params:
    echo help
    quit(0)
  var
    length = 4
    dictSize = 25_000
  if params.len > 0:
    try:
      length = parseInt(params[0])
    except ValueError:
      quit(params[0] & " is not a valid passphrase length.", 1)
    # parseInt accepts zero and negative numbers; a negative length would
    # otherwise crash later when the random buffer is allocated.
    if length < 1:
      quit(params[0] & " is not a valid passphrase length.", 1)
  if params.len > 1:
    try:
      dictSize = parseInt(params[1])
    except ValueError:
      quit(params[1] & " is not a valid dictionary size.", 1)
    if dictSize < 100 or dictSize > dict.len:
      quit("Dictionary size must be between 100 and " & $dict.len, 1)
  result = (length, dictSize)
when isMainModule:
  # Entry point: read the settings from the command line, print a passphrase.
  let settings = parseInput()
  let phrase = genPassphrase(settings[0], settings[1])
  echo phrase

72
src/process.nim Normal file
View File

@ -0,0 +1,72 @@
import std/[os, cpuinfo, times, monotimes]
import std/[streams, strutils, tables]
import scanxml
proc save(wordCounts: CountTable; dictName, countName: string) =
  ## Writes the words of `wordCounts` that occur at least 3 times.
  ##
  ## `dictName` receives one word per line; `countName` receives
  ## "<count> <word>" per line. Iteration stops at the first word with a
  ## count below 3, so the table must already be sorted by descending count
  ## (the caller does this with `sort()`).
  let dictFile = openFileStream(dictName, fmWrite)
  # Close (and thereby flush) each stream on every exit path; previously
  # neither stream was ever closed.
  defer: dictFile.close()
  let countFile = openFileStream(countName, fmWrite)
  defer: countFile.close()
  for word, count in wordCounts:
    if count < 3:
      break
    dictFile.writeLine(word)
    countFile.writeLine($count & " " & word)
# Channels shared between the main thread and the worker threads.
var
  # Each worker sends its finished word-count table here exactly once.
  threadResults: Channel[CountTable[string]]
  # Each worker sends 1 here after finishing a file (drives the progress line).
  progress: Channel[int]
# Channels must be opened before any send/recv.
threadResults.open()
progress.open()
proc processFiles(filenames: seq[string]) =
  ## Worker body: tallies every word yielded by `iterWords` for each file in
  ## `filenames`.
  ##
  ## Sends 1 on `progress` after each file, and the complete tally on
  ## `threadResults` once all files are done.
  var tally: CountTable[string]
  for path in filenames:
    for headword in iterWords(path):
      tally.inc(headword)
    progress.send(1)
  threadResults.send(tally)
when isMainModule:
  # Counts words across all BNC XML files using one worker thread per
  # processor, then writes the merged, sorted result to dictionary.txt and
  # counts.txt.
  let start = getMonoTime()
  let basePath = r"../BNC/2554/download/Texts/"
  var paths: seq[string]
  for path in walkDirRec(basePath):
    if path.endsWith(".xml"):
      paths.add(path)

  # countProcessors() returns 0 when the count cannot be detected. With zero
  # workers nothing would ever be sent on `progress` and the recv loop below
  # would block forever, so always run at least one worker.
  let numThreads = max(1, countProcessors())
  var threads = newSeq[Thread[seq[string]]](numThreads)
  var lastIdx = 0
  for i, t in threads.mpairs:
    # Split the files as evenly as possible: the first
    # (paths.len mod numThreads) workers take one extra file each.
    var chunksize = paths.len div numThreads
    if i < paths.len mod numThreads:
      chunksize += 1
    let newIdx = lastIdx + chunksize
    let chunk = paths[lastIdx ..< newIdx]
    lastIdx = newIdx
    createThread(t, processFiles, chunk)

  # Workers send exactly one progress message per file.
  var processed = 0
  for i in 0 .. paths.high:
    processed += progress.recv()
    stdout.write("Processing files: " & $processed & "\r")
    stdout.flushFile
  echo ""

  # Each worker sends exactly one result table; merge them all.
  var counts: CountTable[string]
  for i in 0 .. threads.high:
    let subCounts = threadResults.recv()
    for word, count in subCounts:
      counts.inc(word, count)

  # Every worker has delivered its result, so joining cannot block for long.
  # Join before closing the channels the workers were using.
  joinThreads(threads)
  threadResults.close()
  progress.close()

  counts.sort()
  save(counts, "dictionary.txt", "counts.txt")
  echo "Done. Finished in ", (getMonoTime() - start).inMilliseconds.float / 1000, " seconds."

74
src/scanxml.nim Normal file
View File

@ -0,0 +1,74 @@
import std/[streams, options, parsexml]
# Codes accepted by iterWords: a word is only yielded when the value of its
# `c5` attribute is in this list.
# NOTE(review): these look like BNC CLAWS5 part-of-speech tags (adjectives,
# nouns, verbs, ...) -- confirm against the corpus documentation.
const validCodes = [
  "AJ0", "AJC", "AJS", "AVQ", "CJS", "DT0", "DTQ", "NN0", "NN1", "NN2",
  "PNI", "PNQ", "PNX", "VVB", "VVD", "VVG", "VVI", "VVN", "VVZ"
]
type Word = object
  ## The two attributes of a `<w>` element that the scanner keeps.
  code: string      # value of the element's `c5` attribute
  headword: string  # value of the element's `hw` attribute
proc isValid(headword: string): bool =
  ## Returns true when `headword` contains only acceptable characters.
  ##
  ## A headword is rejected if any character is non-ASCII (hard to type), a
  ## decimal digit, one of ``$ - ' / , . %``, or a bracket of any kind. The
  ## empty string is accepted.
  const rejected = {
    '0'..'9',
    '$', '-', '\'', '/', ',', '.', '%',
    '(', ')', '[', ']', '{', '}'
  }
  result = true
  for ch in headword:
    if ord(ch) > 127 or ch in rejected:
      return false
proc getWordAttrs(x: var XmlParser): Option[Word] =
  ## Reads the attributes of the element the parser has just opened.
  ##
  ## Advances `x` through the following events until both the `c5` and `hw`
  ## attributes have been seen (returns `some(word)` immediately), or until
  ## the attribute list ends at an element close, a new element open, or the
  ## end of input (returns `none`).
  var w: Word
  x.next()
  # Also stop at xmlEof: once the parser reaches end of input it keeps
  # reporting xmlEof, so without this check a document truncated inside a
  # tag would loop forever.
  while x.kind notin {xmlElementClose, xmlElementOpen, xmlEof}:
    if x.kind == xmlAttribute:
      if x.attrKey == "c5":
        w.code = x.attrValue
      if x.attrKey == "hw":
        w.headword = x.attrValue
      if w.code.len > 0 and w.headword.len > 0:
        return some(w)
    x.next()
  # Falling through leaves `result` at its default, which is none(Word).
iterator iterWords*(filename: string): string =
  ## Yields the headword of every acceptable `<w>` element in the XML file
  ## `filename`.
  ##
  ## A word is yielded when its `c5` code is listed in `validCodes` and its
  ## headword passes `isValid`. If the file cannot be opened, the failure is
  ## reported on stdout and nothing is yielded.
  var file: FileStream
  try:
    file = openFileStream(filename)
  except IOError as e:
    echo "Failed to open file: ", filename
    echo "Message: ", e.msg
  # `file` is still nil when the open failed. Skip parsing in that case
  # rather than handing a nil stream to the parser.
  if file != nil:
    var x: XmlParser
    x.open(file, filename)
    x.next()
    while x.kind != xmlEof:
      x.next()
      if x.kind == xmlElementOpen and x.elementName == "w":
        let res = x.getWordAttrs()
        if res.isSome:
          let word = res.get()
          if word.code in validCodes and isValid(word.headword):
            yield word.headword
    file.close()
when isMainModule:
  # Ad-hoc smoke test: count the words found in one hard-coded corpus file.
  let path = r"C:\Users\Joe\Documents\Code\words\BNC\2554\download\Texts\A\A0\A00.xml"
  var count = 0
  for _ in iterWords(path):
    count += 1
  echo "words found: ", count