Files
passphrase/src/process.nim
2021-07-28 17:01:49 -07:00

100 lines
2.5 KiB
Nim

import std/[os, cpuinfo, times, monotimes]
import std/[streams, sequtils, strutils, tables]
import scanxml
iterator ichunkate(items: seq, numChunks: Natural): (int, seq) =
  ## Splits `items` into `numChunks` consecutive slices and yields each
  ## slice together with its zero-based chunk index.
  ##
  ## Chunk lengths differ by at most one element: the first
  ## `items.len mod numChunks` chunks each receive one extra item. When
  ## there are fewer items than chunks, the trailing chunks are empty.
  ## Nothing is yielded when `numChunks` is 0.
  var lo = 0
  for idx in 0 ..< numChunks:
    # Computed inside the loop so that numChunks == 0 never reaches the
    # division.
    var size = items.len div numChunks
    if idx < items.len mod numChunks:
      inc size
    let hi = lo + size
    yield (idx, items[lo ..< hi])
    lo = hi
proc save(wordCounts: CountTable; dictName, countName: string) =
  ## Writes the frequent words of `wordCounts` to two text files.
  ##
  ## * `dictName` receives one word per line.
  ## * `countName` receives one `"<count> <word>"` line per word.
  ##
  ## Entries are written in the table's iteration order, and writing stops
  ## at the first entry whose count is below 3. Callers that want every
  ## word with a count of at least 3 must therefore pass a table sorted in
  ## descending order of count.
  ##
  ## Raises an exception if either file cannot be opened for writing. Both
  ## streams are closed before returning, including when a write fails.
  let dictFile = openFileStream(dictName, fmWrite)
  defer: dictFile.close()
  let countFile = openFileStream(countName, fmWrite)
  defer: countFile.close()
  for word, count in wordCounts:
    if count < 3:
      # Everything after this point is skipped; see the ordering note above.
      break
    dictFile.writeLine(word)
    countFile.writeLine($count & " " & word)
type
  Config = object
    ## Filesystem locations used by a run of the program.
    srcPath: string  ## Directory that is searched for input files.
    dstPath: string  ## Directory in which the output files are created.
proc parseInput(): Config =
  ## Builds the run configuration from the command-line arguments.
  ##
  ## The first argument, when present, replaces the default source
  ## directory; the second, when present, replaces the default destination
  ## directory (the current directory).
  ##
  ## Terminates the program with an error message if either directory does
  ## not exist.
  result = Config(
    srcPath: r"../data/BNC/2554/download/Texts/",
    dstPath: ".")
  if paramCount() > 0:
    result.srcPath = paramStr(1)
  if paramCount() > 1:
    result.dstPath = paramStr(2)
  if not dirExists(result.srcPath):
    quit("Could not locate datafiles: directory " & result.srcPath &
         " does not exist.")
  # Validate the destination up front: otherwise a bad path is only
  # noticed when the output files are opened, after all input has already
  # been processed.
  if not dirExists(result.dstPath):
    quit("Could not write output: directory " & result.dstPath &
         " does not exist.")
# Module-level channels shared by the worker threads and the main thread.
# Both are opened at module load, before any thread is created.
var threadResults: Channel[CountTable[string]]  # word counts from workers
var progress: Channel[int]                      # progress increments
open(threadResults)
open(progress)
proc processFiles(filenames: seq[string]) {.thread.} =
  ## Thread entry point: counts the words found in every file of
  ## `filenames`.
  ##
  ## After each file is finished, `1` is sent on the `progress` channel.
  ## Once all files are done, the accumulated counts are sent as a single
  ## message on the `threadResults` channel. An empty `filenames` still
  ## produces one (empty) result message.
  ##
  ## The `{.thread.}` pragma makes the compiler verify GC-safety here, at
  ## the definition, rather than only where the proc is handed to
  ## `createThread`.
  var counts: CountTable[string]
  for file in filenames:
    for word in iterWords(file):
      counts.inc(word)
    progress.send(1)
  threadResults.send(counts)
when isMainModule:
  # Entry point: count words in all XML files below the source directory
  # using one worker thread per processor, then write the merged, sorted
  # counts to `dictionary.txt` and `counts.txt` in the destination
  # directory.
  let start = getMonoTime()
  let config = parseInput()

  # Collect every XML file below the source directory.
  var paths: seq[string]
  for path in walkDirRec(config.srcPath):
    if path.endsWith(".xml"):
      paths.add(path)

  # countProcessors() returns 0 when the processor count cannot be
  # detected. With zero workers nothing would ever be sent on `progress`
  # and the loop below would block forever, so at least one worker is
  # always started. Starting more workers than there are files would only
  # create idle threads.
  let numThreads = max(1, min(countProcessors(), paths.len))
  var threads = newSeq[Thread[seq[string]]](numThreads)
  for i, chunk in paths.ichunkate(numThreads):
    createThread(threads[i], processFiles, chunk)

  # Exactly one progress message arrives per input file.
  var processed = 0
  for _ in 0 ..< paths.len:
    processed += progress.recv()
    stdout.write("Processing files: " & $processed & "\r")
    stdout.flushFile
  echo ""

  # Every worker sends exactly one result table; merge them all.
  var counts: CountTable[string]
  for _ in 0 ..< numThreads:
    let subCounts = threadResults.recv()
    for word, count in subCounts:
      counts.inc(word, count)

  # Wait for the workers to finish before releasing the channels, so that
  # no thread can still be inside a send when a channel is closed.
  joinThreads(threads)
  threadResults.close()
  progress.close()

  # save() stops at the first rare word, so the table must be sorted.
  counts.sort()
  let
    dPath = joinPath(config.dstPath, "dictionary.txt")
    cPath = joinPath(config.dstPath, "counts.txt")
  save(counts, dPath, cPath)
  echo "Done. Finished in ",
    (getMonoTime() - start).inMilliseconds.float / 1000, " seconds."