initial commit
This commit is contained in:
commit
b388552a27
2
.gitignore
vendored
Normal file
2
.gitignore
vendored
Normal file
@ -0,0 +1,2 @@
|
|||||||
|
*.exe
|
||||||
|
BNC/*
|
0
counts.txt
Normal file
0
counts.txt
Normal file
0
dictionary.txt
Normal file
0
dictionary.txt
Normal file
50
passphrase.nimble
Normal file
50
passphrase.nimble
Normal file
@ -0,0 +1,50 @@
|
|||||||
|
# Package
|
||||||
|
|
||||||
|
version = "0.1.0"
|
||||||
|
author = "Joseph Montanaro"
|
||||||
|
description = "Passphrase generator and dictionary builder"
|
||||||
|
license = "none"
|
||||||
|
srcDir = "src"
|
||||||
|
bin = @["passphrase"]
|
||||||
|
|
||||||
|
|
||||||
|
# Dependencies
|
||||||
|
|
||||||
|
requires "nim >= 1.0"
|
||||||
|
requires "nimcrypto >= 0.4.8"
|
||||||
|
|
||||||
|
|
||||||
|
# Tasks n scripts
|
||||||
|
|
||||||
|
import strutils
|
||||||
|
|
||||||
|
proc runCmd(command: string, input = "", cache = ""): string =
  ## Runs `command` through `gorgeEx` and returns what it printed.
  ##
  ## When the command exits with a non-zero status, the command line and its
  ## output are echoed and the script quits with that same status.
  let res = gorgeEx(command, input, cache)
  if res.exitCode == 0:
    return res.output
  echo "Command failed: " & command
  echo "Output:\n ", res.output.splitLines().join("\n ")
  quit(res.exitCode)
|
||||||
|
|
||||||
|
proc pathExists(filename: string): bool =
  ## Reports whether `filename` names an existing file or directory.
  ##
  ## Uses the built-in existence checks instead of running `ls`, so it does
  ## not depend on an `ls` executable being available and never passes
  ## `filename` through a shell.
  fileExists(filename) or dirExists(filename)
|
||||||
|
|
||||||
|
|
||||||
|
task dictionary, "Generate dictionary from BNC XML files":
  # The BNC directory is git-ignored, so it may be absent; stop early then.
  let haveCorpus = pathExists("BNC")
  if not haveCorpus:
    quit("Cannot build dictionary: BNC data files not found.", 1)
  echo "Building dictionary"
  let buildLog = runCmd("nim c --run --threads:on -d:release -d:lto src/process.nim")
  echo buildLog
|
||||||
|
|
||||||
|
|
||||||
|
task showexec, "this is a test":
  # Scratch task: prints the output and the exit code of `ls BNC`.
  let listing = gorgeEx("ls BNC")
  echo listing.output
  echo listing.exitCode
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
before build:
  # Generate the word list first when src/dictionary.txt is not there yet.
  let dictionaryReady = pathExists("src/dictionary.txt")
  if not dictionaryReady:
    dictionaryTask()
|
64065
src/counts.txt
Normal file
64065
src/counts.txt
Normal file
File diff suppressed because it is too large
Load Diff
64065
src/dictionary.txt
Normal file
64065
src/dictionary.txt
Normal file
File diff suppressed because it is too large
Load Diff
83
src/passphrase.nim
Normal file
83
src/passphrase.nim
Normal file
@ -0,0 +1,83 @@
|
|||||||
|
import std/[os, strutils]
|
||||||
|
import nimcrypto/sysrand
|
||||||
|
|
||||||
|
|
||||||
|
type
  Dictionary = object
    ## A word list held as one string plus the start index of each word.
    words: string         # all words stored back to back
    offsets: seq[uint32]  # offsets[i] is the index in `words` where word i starts
|
||||||
|
|
||||||
|
|
||||||
|
proc `[]`(d: Dictionary, i: Natural): string =
  ## Returns the `i`-th word of the dictionary.
  # offsets are uint32 but strings are indexed with int, hence the conversions
  let first = int(d.offsets[i])
  let last =
    if i < d.offsets.high:
      int(d.offsets[i + 1]) - 1
    else:
      # the final word has no successor offset; it runs to the end of `words`
      d.words.high
  d.words[first .. last]
|
||||||
|
|
||||||
|
|
||||||
|
proc len(d: Dictionary): int =
  ## Number of words in the dictionary (one offset is stored per word).
  d.offsets.len
|
||||||
|
|
||||||
|
|
||||||
|
proc loadWords(): Dictionary =
  ## Builds a `Dictionary` from the lines of dictionary.txt, which is read
  ## with `staticRead`. Surrounding whitespace of the file is stripped first.
  let text = strip(staticRead("dictionary.txt"))
  for entry in splitLines(text):
    # the word starts where the concatenated string currently ends
    result.offsets.add(uint32(len(result.words)))
    result.words.add(entry)
|
||||||
|
|
||||||
|
|
||||||
|
# Word list produced by loadWords(); being a const, it is evaluated at compile time.
const dict = loadWords()
|
||||||
|
|
||||||
|
|
||||||
|
proc genPassphrase(length, dictSize: int): string =
  ## Returns `length` words joined by single spaces, each chosen with random
  ## data from `randomBytes` out of the first `dictSize` entries of `dict`.
  ##
  ## Quits with status 1 when `dictSize` is outside 100 .. dict.len, when
  ## `length` is negative, or when not enough random data could be obtained.
  if dictSize < 100 or dictSize > dict.len:
    quit("Dictionary size must be between 100 and " & $dict.len, 1)
  if length < 0:
    quit("Passphrase length must not be negative.", 1)

  var rands = newSeq[uint64](length)
  # randomBytes reports how many items it filled. Anything short of the whole
  # buffer would leave zeroed entries behind, i.e. predictable words, so the
  # result must not be ignored.
  if randomBytes(rands) != rands.len:
    quit("Could not read enough random data from the system.", 1)

  var words = newSeqOfCap[string](length)
  for r in rands:
    # reduce in uint64 first, then convert: the result is < dictSize and
    # therefore always fits an int index
    let i = int(r mod dictSize.uint64)
    words.add(dict[i])
  result = words.join(" ")
|
||||||
|
|
||||||
|
|
||||||
|
# Usage text echoed by parseInput when -h or --help is given.
const help = """Usage:
|
||||||
|
passphrase [LENGTH] [DICTSIZE]
|
||||||
|
|
||||||
|
Defaults to length of 4 and dictionary size of 25,000."""
|
||||||
|
|
||||||
|
proc parseInput(): (int, int) =
  ## Reads the command line and returns `(length, dictSize)`.
  ##
  ## `-h` / `--help` anywhere prints the usage text and exits with status 0.
  ## The first argument overrides the length (default 4), the second the
  ## dictionary size (default 25_000). A value that is not an integer, or a
  ## dictionary size outside 100 .. dict.len, quits with status 1.
  let args = commandLineParams()
  if args.contains("-h") or args.contains("--help"):
    echo help
    quit(0)

  result = (4, 25_000)

  if args.len >= 1:
    try:
      result[0] = parseInt(args[0])
    except ValueError:
      quit(args[0] & " is not a valid passphrase length.", 1)

  if args.len >= 2:
    try:
      result[1] = parseInt(args[1])
    except ValueError:
      quit(args[1] & " is not a valid dictionary size.", 1)
    if result[1] < 100 or result[1] > dict.len:
      quit("Dictionary size must be between 100 and " & $dict.len, 1)
|
||||||
|
|
||||||
|
|
||||||
|
when isMainModule:
  # Parse the command line, then print one generated passphrase.
  let settings = parseInput()
  echo genPassphrase(settings[0], settings[1])
|
72
src/process.nim
Normal file
72
src/process.nim
Normal file
@ -0,0 +1,72 @@
|
|||||||
|
import std/[os, cpuinfo, times, monotimes]
|
||||||
|
import std/[streams, strutils, tables]
|
||||||
|
import scanxml
|
||||||
|
|
||||||
|
|
||||||
|
proc save(wordCounts: CountTable; dictName, countName: string) =
  ## Writes every word counted at least 3 times to `dictName` (one word per
  ## line) and to `countName` (as "<count> <word>" lines).
  ##
  ## Iteration stops at the first entry below the threshold, so `wordCounts`
  ## is expected to be sorted by descending count already.
  ##
  ## Raises `IOError` if either file cannot be opened for writing.
  let dictFile = openFileStream(dictName, fmWrite)
  defer: dictFile.close()   # close (and flush) even if a write fails
  let countFile = openFileStream(countName, fmWrite)
  defer: countFile.close()

  for word, count in wordCounts:
    if count < 3:
      break
    dictFile.writeLine(word)
    countFile.writeLine($count & " " & word)
|
||||||
|
|
||||||
|
|
||||||
|
# Channels the worker procs send on and the main module receives from.
var threadResults: Channel[CountTable[string]]  # one finished table per worker
var progress: Channel[int]                      # one message per processed file
open(threadResults)
open(progress)
|
||||||
|
|
||||||
|
|
||||||
|
proc processFiles(filenames: seq[string]) =
  ## Worker body: tallies the words yielded by `iterWords` for every file in
  ## `filenames`, sends 1 on `progress` after each file, and finally sends the
  ## complete table on `threadResults`.
  var tally: CountTable[string]
  for path in filenames:
    for headword in iterWords(path):
      inc(tally, headword)
    send(progress, 1)
  send(threadResults, tally)
|
||||||
|
|
||||||
|
|
||||||
|
when isMainModule:
  # Counts words across all .xml files below basePath, using one worker
  # thread per detected CPU core, then writes dictionary.txt and counts.txt
  # into the current working directory.
  let start = getMonoTime()
  let basePath = r"../BNC/2554/download/Texts/"

  var paths: seq[string]
  for path in walkDirRec(basePath):
    if path.endsWith(".xml"):
      paths.add(path)

  # Without input there is nothing to count; stop instead of replacing the
  # existing output files with empty ones.
  if paths.len == 0:
    quit("No .xml files found under " & basePath, 1)

  # countProcessors() returns 0 when the core count cannot be detected, which
  # would make the chunk-size division below fail.
  let numThreads = max(1, countProcessors())
  var threads = newSeq[Thread[seq[string]]](numThreads)
  var lastIdx = 0
  for i, t in threads.mpairs:
    # Spread the files evenly: the first (paths.len mod numThreads) workers
    # each take one extra file.
    var chunksize = paths.len div numThreads
    if i < paths.len mod numThreads:
      chunksize += 1

    let newIdx = lastIdx + chunksize
    let chunk = paths[lastIdx ..< newIdx]
    lastIdx = newIdx

    createThread(t, processFiles, chunk)

  # Every worker sends one progress message per file, so exactly paths.len
  # messages arrive in total.
  var processed = 0
  for i in 0 .. paths.high:
    processed += progress.recv()
    stdout.write("Processing files: " & $processed & "\r")
    stdout.flushFile
  echo ""

  # One result table arrives per worker; merge them into a single tally.
  var counts: CountTable[string]
  for i in 0 .. threads.high:
    let subCounts = threadResults.recv()
    for word, count in subCounts:
      counts.inc(word, count)

  # All results are in: wait for the workers to exit, then release channels.
  joinThreads(threads)
  threadResults.close()
  progress.close()

  # save() stops at the first rare word, so the table must be sorted first.
  counts.sort()
  save(counts, "dictionary.txt", "counts.txt")

  echo "Done. Finished in ", (getMonoTime() - start).inMilliseconds.float / 1000, " seconds."
|
74
src/scanxml.nim
Normal file
74
src/scanxml.nim
Normal file
@ -0,0 +1,74 @@
|
|||||||
|
import std/[streams, options, parsexml]
|
||||||
|
|
||||||
|
|
||||||
|
# Values of a word's "c5" attribute that iterWords accepts.
const validCodes = [
  "AJ0", "AJC", "AJS",
  "AVQ",
  "CJS",
  "DT0", "DTQ",
  "NN0", "NN1", "NN2",
  "PNI", "PNQ", "PNX",
  "VVB", "VVD", "VVG", "VVI", "VVN", "VVZ"
]
|
||||||
|
|
||||||
|
|
||||||
|
type
  Word = object
    ## The two attributes collected from a `<w>` element.
    code: string      # value of the "c5" attribute
    headword: string  # value of the "hw" attribute
|
||||||
|
|
||||||
|
|
||||||
|
proc isValid(headword: string): bool =
  ## True when `headword` consists only of ASCII characters and contains no
  ## digit and none of the characters $ - ' / , . % ( ) [ ] { }.
  const rejected = {'0'..'9', '$', '-', '\'', '/', ',', '.', '%',
                    '(', ')', '[', ']', '{', '}'}
  result = true
  for ch in headword:
    # anything above 0x7F is a non-ASCII byte
    if ch > '\x7F' or ch in rejected:
      return false
|
||||||
|
|
||||||
|
|
||||||
|
proc getWordAttrs(x: var XmlParser): Option[Word] =
  ## Advances the parser through the attributes that follow the current
  ## event and returns a `Word` as soon as both "c5" and "hw" have non-empty
  ## values.
  ##
  ## Returns `none` when an element close, another element open, or the end
  ## of the input is reached before both values were found.
  var w: Word
  x.next()
  # xmlEof is a stop condition as well: without it, input that ends in the
  # middle of a tag would never produce a close event and the loop would not
  # terminate.
  while x.kind notin {xmlElementClose, xmlElementOpen, xmlEof}:
    if x.kind == xmlAttribute:
      case x.attrKey
      of "c5": w.code = x.attrValue
      of "hw": w.headword = x.attrValue
      else: discard
      if w.code.len > 0 and w.headword.len > 0:
        return some(w)
    x.next()
  result = none(Word)
|
||||||
|
|
||||||
|
|
||||||
|
iterator iterWords*(filename: string): string =
  ## Yields the headword of every `<w>` element in the XML file `filename`
  ## whose code is in `validCodes` and whose headword passes `isValid`.
  ##
  ## If the file cannot be opened, the failure is echoed and nothing is
  ## yielded.
  var file: FileStream
  try:
    file = openFileStream(filename)
  except IOError as e:
    echo "Failed to open file: ", filename
    echo "Message: ", e.msg

  # `file` is still nil when opening failed; a nil stream must not be handed
  # to the parser, so all parsing is guarded by this check.
  if file != nil:
    var x: XmlParser
    x.open(file, filename)
    x.next()
    while x.kind != xmlEof:
      x.next()
      if x.kind == xmlElementOpen and x.elementName == "w":
        let res = x.getWordAttrs()
        if res.isSome:
          let word = res.get()
          if word.code in validCodes and isValid(word.headword):
            yield word.headword
    file.close()
|
||||||
|
|
||||||
|
|
||||||
|
when isMainModule:
  # Manual check: count the words iterWords yields for one fixed file.
  const samplePath = r"C:\Users\Joe\Documents\Code\words\BNC\2554\download\Texts\A\A0\A00.xml"
  var total = 0
  for _ in iterWords(samplePath):
    total += 1
  echo "words found: ", total
|
Loading…
x
Reference in New Issue
Block a user