initial commit
This commit is contained in:
commit
b388552a27
2
.gitignore
vendored
Normal file
2
.gitignore
vendored
Normal file
@ -0,0 +1,2 @@
|
||||
*.exe
|
||||
BNC/*
|
0
counts.txt
Normal file
0
counts.txt
Normal file
0
dictionary.txt
Normal file
0
dictionary.txt
Normal file
50
passphrase.nimble
Normal file
50
passphrase.nimble
Normal file
@ -0,0 +1,50 @@
|
||||
# Package
|
||||
|
||||
version = "0.1.0"
|
||||
author = "Joseph Montanaro"
|
||||
description = "Passphrase generator and dictionary builder"
|
||||
license = "none"
|
||||
srcDir = "src"
|
||||
bin = @["passphrase"]
|
||||
|
||||
|
||||
# Dependencies
|
||||
|
||||
requires "nim >= 1.0"
|
||||
requires "nimcrypto >= 0.4.8"
|
||||
|
||||
|
||||
# Tasks n scripts
|
||||
|
||||
import strutils
|
||||
|
||||
proc runCmd(command: string, input = "", cache = ""): string =
  ## Runs `command` through `gorgeEx` and returns its captured output.
  ##
  ## On a non-zero exit code the command and its output are echoed and the
  ## script quits with that same exit code.
  let res = gorgeEx(command, input, cache)
  if res.exitCode == 0:
    return res.output
  echo "Command failed: " & command
  echo "Output:\n ", join(splitLines(res.output), "\n ")
  quit(res.exitCode)
|
||||
|
||||
proc pathExists(filename: string): bool =
  ## Returns true when `filename` names an existing file or directory.
  ##
  ## Uses the built-in existence checks instead of shelling out to `ls`, so it
  ## does not depend on `ls` being installed and is not confused by paths that
  ## contain spaces or shell metacharacters.
  result = fileExists(filename) or dirExists(filename)
|
||||
|
||||
|
||||
task dictionary, "Generate dictionary from BNC XML files":
  # Without the BNC directory there is nothing to process; stop early.
  if not pathExists("BNC"):
    quit("Cannot build dictionary: BNC data files not found.", 1)
  echo "Building dictionary"
  const buildCmd = "nim c --run --threads:on -d:release -d:lto src/process.nim"
  let buildOutput = runCmd(buildCmd)
  echo buildOutput
|
||||
|
||||
|
||||
task showexec, "this is a test":
  # Scratch task: prints what `gorgeEx` reports for `ls BNC`.
  let listing = gorgeEx("ls BNC")
  echo listing.output
  echo listing.exitCode
|
||||
|
||||
|
||||
|
||||
before build:
  # Generate the word list first when it is not present yet.
  let haveDictionary = pathExists("src/dictionary.txt")
  if not haveDictionary:
    dictionaryTask()
|
64065
src/counts.txt
Normal file
64065
src/counts.txt
Normal file
File diff suppressed because it is too large
Load Diff
64065
src/dictionary.txt
Normal file
64065
src/dictionary.txt
Normal file
File diff suppressed because it is too large
Load Diff
83
src/passphrase.nim
Normal file
83
src/passphrase.nim
Normal file
@ -0,0 +1,83 @@
|
||||
import std/[os, strutils]
|
||||
import nimcrypto/sysrand
|
||||
|
||||
|
||||
type
  Dictionary = object
    ## Word list stored as one string plus the start index of every word.
    words: string         ## all words concatenated, without separators
    offsets: seq[uint32]  ## index in `words` at which each word begins
|
||||
|
||||
|
||||
proc `[]`(d: Dictionary, i: Natural): string =
  ## Returns the `i`-th word of the dictionary.
  # Offsets are stored as uint32, while strings are indexed with int.
  let first = int(d.offsets[i])
  # The final word has no following offset, so it runs to the end of `words`.
  let last =
    if i < d.offsets.high: int(d.offsets[i + 1]) - 1
    else: d.words.high
  d.words[first .. last]
|
||||
|
||||
|
||||
proc len(d: Dictionary): int =
  ## Number of words in the dictionary (one stored offset per word).
  d.offsets.len
|
||||
|
||||
|
||||
proc loadWords(): Dictionary =
  ## Builds the dictionary from `dictionary.txt` (read with `staticRead`);
  ## every line of the stripped file content becomes one word.
  let text = staticRead("dictionary.txt")
  let lines = splitLines(strip(text))
  for line in lines:
    # Record where this word starts before appending it.
    result.offsets.add(uint32(result.words.len))
    result.words.add(line)


# A `const`, so the whole word list is built at compile time.
const dict = loadWords()
|
||||
|
||||
|
||||
proc genPassphrase(length, dictSize: int): string =
  ## Returns `length` words joined by single spaces, each picked with system
  ## randomness from the first `dictSize` dictionary entries.
  ##
  ## Quits with exit code 1 when `dictSize` is outside `100 .. dict.len` or
  ## when the system random source fails to fill the buffer.
  if dictSize < 100 or dictSize > dict.len:
    quit("Dictionary size must be between 100 and " & $dict.len, 1)

  var rands = newSeq[uint64](length)
  # randomBytes signals failure with a negative result and otherwise reports
  # how much data it produced. Ignoring that would silently build the
  # passphrase from the zero-initialised buffer, i.e. a predictable phrase.
  if randomBytes(rands) < rands.len:
    quit("Could not read random data from the system.", 1)

  var words = newSeqOfCap[string](rands.len)
  for r in rands:
    # Reduce the 64-bit random value to an index below dictSize.
    let i = int(r mod dictSize.uint64)
    words.add(dict[i])
  result = words.join(" ")
|
||||
|
||||
|
||||
# Text printed for `-h` / `--help`.
# NOTE(review): leading indentation of the usage line is not visible in this
# view of the file; the text is reproduced exactly as shown - confirm.
const help = "Usage:\n" &
  "passphrase [LENGTH] [DICTSIZE]\n" &
  "\n" &
  "Defaults to length of 4 and dictionary size of 25,000."
|
||||
|
||||
proc parseInput(): (int, int) =
  ## Reads `[LENGTH] [DICTSIZE]` from the command line and returns them as
  ## `(length, dictSize)`; missing arguments default to 4 and 25_000.
  ##
  ## Prints the usage text and exits with code 0 when `-h` or `--help` is
  ## given. Exits with code 1 and a message when an argument is not an
  ## integer, the length is below 1, or the dictionary size is outside
  ## `100 .. dict.len`.
  let params = commandLineParams()
  if "-h" in params or "--help" in params:
    echo help
    quit(0)

  var
    length = 4
    dictSize = 25_000

  if params.len > 0:
    try:
      length = parseInt(params[0])
    except ValueError:
      quit(params[0] & " is not a valid passphrase length.", 1)
    # A negative length would otherwise only show up later as a range error
    # with a stack trace; reject it here with a readable message.
    if length < 1:
      quit("Passphrase length must be at least 1.", 1)

  if params.len > 1:
    try:
      dictSize = parseInt(params[1])
    except ValueError:
      quit(params[1] & " is not a valid dictionary size.", 1)
    if dictSize < 100 or dictSize > dict.len:
      quit("Dictionary size must be between 100 and " & $dict.len, 1)

  result = (length, dictSize)
|
||||
|
||||
|
||||
when isMainModule:
  # Command-line entry point: parse the arguments and print one passphrase.
  let args = parseInput()
  echo genPassphrase(args[0], args[1])
|
72
src/process.nim
Normal file
72
src/process.nim
Normal file
@ -0,0 +1,72 @@
|
||||
import std/[os, cpuinfo, times, monotimes]
|
||||
import std/[streams, strutils, tables]
|
||||
import scanxml
|
||||
|
||||
|
||||
proc save(wordCounts: CountTable; dictName, countName: string) =
  ## Writes every word with a count of at least 3 to `dictName` (one word per
  ## line) and the matching "<count> <word>" lines to `countName`.
  ##
  ## Iteration stops at the first entry whose count is below 3, so
  ## `wordCounts` is expected to be sorted by descending count already.
  ## Raises `IOError` when either file cannot be opened for writing.
  let dictFile = openFileStream(dictName, fmWrite)
  # Close (and thereby flush) each stream on every exit path.
  defer: dictFile.close()
  let countFile = openFileStream(countName, fmWrite)
  defer: countFile.close()

  for word, count in wordCounts:
    if count < 3:
      break
    dictFile.writeLine(word)
    countFile.writeLine($count & " " & word)
|
||||
|
||||
|
||||
# Channels used by `processFiles`: `threadResults` carries a finished count
# table, `progress` carries a 1 for each file that has been processed.
var threadResults: Channel[CountTable[string]]
var progress: Channel[int]

open(threadResults)
open(progress)
|
||||
|
||||
|
||||
proc processFiles(filenames: seq[string]) =
  ## Counts the words yielded by `iterWords` for every file in `filenames`,
  ## sending 1 on `progress` after each file and the accumulated table on
  ## `threadResults` once all files are done.
  var tally: CountTable[string]
  for path in filenames:
    for headword in iterWords(path):
      inc(tally, headword)
    send(progress, 1)
  send(threadResults, tally)
|
||||
|
||||
|
||||
when isMainModule:
  let start = getMonoTime()
  let basePath = r"../BNC/2554/download/Texts/"

  # Collect every XML file below the corpus directory.
  var paths: seq[string]
  for path in walkDirRec(basePath):
    if path.endsWith(".xml"):
      paths.add(path)

  # countProcessors() returns 0 when the core count cannot be detected, which
  # would make the chunk-size division below fail; always keep one worker.
  let numThreads = max(1, countProcessors())
  var threads = newSeq[Thread[seq[string]]](numThreads)

  # Hand each worker a contiguous slice of the paths. The first
  # `paths.len mod numThreads` workers take one extra file so that every
  # path is assigned exactly once.
  var lastIdx = 0
  for i, t in threads.mpairs:
    var chunksize = paths.len div numThreads
    if i < paths.len mod numThreads:
      chunksize += 1

    let newIdx = lastIdx + chunksize
    let chunk = paths[lastIdx ..< newIdx]
    lastIdx = newIdx

    createThread(t, processFiles, chunk)

  # Workers send one progress message per file, so exactly paths.len
  # messages are expected.
  var processed = 0
  for i in 0 .. paths.high:
    processed += progress.recv()
    stdout.write("Processing files: " & $processed & "\r")
    stdout.flushFile
  echo ""

  # Merge the per-thread tables; every worker sends exactly one result.
  var counts: CountTable[string]
  for i in 0 .. threads.high:
    let subCounts = threadResults.recv()
    for word, count in subCounts:
      counts.inc(word, count)

  # All results are in: wait for the workers to exit, then release the
  # channels.
  joinThreads(threads)
  threadResults.close()
  progress.close()

  counts.sort()
  save(counts, "dictionary.txt", "counts.txt")

  let elapsed = (getMonoTime() - start).inMilliseconds.float / 1000
  echo "Done. Finished in ", elapsed, " seconds."
|
74
src/scanxml.nim
Normal file
74
src/scanxml.nim
Normal file
@ -0,0 +1,74 @@
|
||||
import std/[streams, options, parsexml]
|
||||
|
||||
|
||||
# Values of the `c5` attribute that a <w> element must carry for its headword
# to be yielded by `iterWords`.
const validCodes = [
  "AJ0", "AJC", "AJS", "AVQ", "CJS",
  "DT0", "DTQ",
  "NN0", "NN1", "NN2",
  "PNI", "PNQ", "PNX",
  "VVB", "VVD", "VVG", "VVI", "VVN", "VVZ"
]
|
||||
|
||||
|
||||
type
  Word = object
    ## Attribute values collected from one <w> element.
    code: string      ## value of the `c5` attribute
    headword: string  ## value of the `hw` attribute
|
||||
|
||||
|
||||
proc isValid(headword: string): bool =
  ## Returns false as soon as `headword` contains a non-ASCII byte, a digit,
  ## or one of the rejected punctuation/bracket characters; returns true
  ## otherwise (including for the empty string).
  const rejected = {'0'..'9', '$', '-', '\'', '/', ',', '.', '%',
                    '(', ')', '[', ']', '{', '}'}
  for ch in headword:
    # Bytes above 127 are not ASCII and are never accepted.
    if ch > '\x7F' or ch in rejected:
      return false
  result = true
|
||||
|
||||
|
||||
proc getWordAttrs(x: var XmlParser): Option[Word] =
  ## Advances `x` through the events following an element-open event and
  ## returns a `Word` as soon as both a `c5` and an `hw` attribute with
  ## non-empty values have been seen. Returns `none(Word)` when an
  ## element-close, element-open or end-of-input event is reached first.
  var w: Word
  x.next()
  # xmlEof has to end the scan as well: it is neither of the other two
  # kinds, so reaching the end of a truncated file would otherwise keep this
  # loop spinning forever.
  while x.kind notin {xmlElementClose, xmlElementOpen, xmlEof}:
    if x.kind == xmlAttribute:
      if x.attrKey == "c5":
        w.code = x.attrValue
      if x.attrKey == "hw":
        w.headword = x.attrValue

    if w.code.len > 0 and w.headword.len > 0:
      return some(w)
    x.next()
  result = none(Word)
|
||||
|
||||
|
||||
iterator iterWords*(filename: string): string =
  ## Yields the `hw` value of every <w> element in the XML file `filename`
  ## whose `c5` code is listed in `validCodes` and whose headword passes
  ## `isValid`.
  ##
  ## When the file cannot be opened the failure is echoed and nothing is
  ## yielded.
  var file: FileStream
  try:
    file = openFileStream(filename)
  except IOError as e:
    echo "Failed to open file: ", filename
    echo "Message: ", e.msg

  # `file` is still nil after a failed open; handing that to the parser would
  # crash, so such a file is skipped instead.
  if file != nil:
    var x: XmlParser
    x.open(file, filename)
    x.next()
    while x.kind != xmlEof:
      x.next()
      if x.kind == xmlElementOpen and x.elementName == "w":
        let res = x.getWordAttrs()
        if res.isSome:
          let word = res.get()
          if word.code in validCodes and isValid(word.headword):
            yield word.headword
    file.close()
|
||||
|
||||
|
||||
when isMainModule:
  # Manual smoke test: count the words yielded for one hard-coded file.
  const samplePath =
    r"C:\Users\Joe\Documents\Code\words\BNC\2554\download\Texts\A\A0\A00.xml"
  var total = 0
  for headword in iterWords(samplePath):
    total += 1
  echo "words found: ", total
|
Loading…
x
Reference in New Issue
Block a user