100 lines
2.5 KiB
Nim
100 lines
2.5 KiB
Nim
import std/[os, cpuinfo, times, monotimes]
|
|
import std/[streams, sequtils, strutils, tables]
|
|
import scanxml
|
|
|
|
|
|
iterator ichunkate(items: seq, numChunks: Natural): (int, seq) =
  ## Splits `items` into `numChunks` consecutive slices and yields each one
  ## together with its zero-based chunk index.
  ##
  ## Slice lengths differ by at most one: the first `items.len mod numChunks`
  ## slices receive one extra element. When there are fewer items than chunks
  ## the trailing slices are empty. Nothing is yielded for `numChunks == 0`.
  var lo = 0
  for idx in countup(0, numChunks - 1):
    # Sizes are computed inside the loop so that a zero chunk count never
    # reaches the division.
    var size = items.len div numChunks
    if idx < items.len mod numChunks:
      inc size
    let hi = lo + size
    yield (idx, items[lo ..< hi])
    lo = hi
|
|
|
|
|
|
proc save(wordCounts: CountTable; dictName, countName: string) =
  ## Writes the frequent words of `wordCounts` to two text files.
  ##
  ## * `dictName`  receives one word per line.
  ## * `countName` receives one `<count> <word>` line per word.
  ##
  ## Only entries with a count of at least 3 are written, and iteration stops
  ## at the first entry below that threshold. The table is therefore expected
  ## to be sorted in descending order already; on an unsorted table, entries
  ## after the first low count are skipped.
  ##
  ## `openFileStream` raises `IOError` if either file cannot be opened.
  let dictFile = openFileStream(dictName, fmWrite)
  # Close (and thereby flush) the streams on every exit path, including an
  # exception while opening the second file or while writing.
  defer: dictFile.close()
  let countFile = openFileStream(countName, fmWrite)
  defer: countFile.close()

  for word, count in wordCounts:
    if count >= 3:
      dictFile.writeLine(word)
      countFile.writeLine($count & " " & word)
    else:
      # Sorted input: everything from here on is below the threshold.
      break
|
|
|
|
|
|
type
  Config = object
    ## Settings for one run, as filled in by `parseInput`.
    srcPath: string  ## directory that is searched for the input files
    dstPath: string  ## directory the output files are placed in
|
|
|
|
|
|
proc parseInput(): Config =
  ## Builds the run configuration from the command line.
  ##
  ## * 1st argument (optional): source directory; defaults to the relative
  ##   BNC texts path below.
  ## * 2nd argument (optional): destination directory; defaults to `"."`.
  ##
  ## Terminates the program via `quit` with an error message if either
  ## directory does not exist.
  result.srcPath = r"../data/BNC/2554/download/Texts/"
  result.dstPath = "."

  if paramCount() > 0:
    result.srcPath = paramStr(1)
  if paramCount() > 1:
    result.dstPath = paramStr(2)

  if not dirExists(result.srcPath):
    quit("Could not locate datafiles: directory " & result.srcPath & " does not exist.")
  # Check the destination up front as well: otherwise a bad path is only
  # discovered when the output files are opened, after all processing is done.
  if not dirExists(result.dstPath):
    quit("Could not write results: directory " & result.dstPath & " does not exist.")
|
|
|
|
|
|
# Channels shared between the worker threads and the main thread.

# Carries each worker's finished word-count table.
var threadResults: Channel[CountTable[string]]

# Carries per-file progress ticks from the workers.
var progress: Channel[int]

open(threadResults)
open(progress)
|
|
|
|
|
|
proc processFiles(filenames: seq[string]) =
  ## Thread entry point: counts the words of every file in `filenames`.
  ##
  ## Sends `1` on the `progress` channel after each file has been read, and
  ## sends the accumulated table on `threadResults` exactly once at the end
  ## (also when `filenames` is empty).
  var tally: CountTable[string]
  for path in filenames:
    for token in iterWords(path):
      inc(tally, token)
    send(progress, 1)
  send(threadResults, tally)
|
|
|
|
|
|
when isMainModule:
  # Reference point for the elapsed-time report printed at the end.
  let start = getMonoTime()

  let config = parseInput()

  # Collect every .xml file below the source directory.
  var paths: seq[string]
  for path in walkDirRec(config.srcPath):
    if path.endsWith(".xml"):
      paths.add(path)

  # countProcessors() returns 0 when the core count cannot be detected. With
  # zero workers nothing is ever sent on `progress`, and the receive loop
  # below would block forever, so always start at least one thread.
  let numThreads = max(1, countProcessors())
  var threads = newSeq[Thread[seq[string]]](numThreads)
  for i, chunk in paths.ichunkate(numThreads):
    createThread(threads[i], processFiles, chunk)

  # Each worker sends one progress message per file it has processed.
  var processed = 0
  for _ in 0 .. paths.high:
    processed += progress.recv()
    stdout.write("Processing files: " & $processed & "\r")
    stdout.flushFile
  echo ""

  # Merge the per-thread tables; every worker sends exactly one.
  var counts: CountTable[string]
  for _ in 0 .. threads.high:
    let subCounts = threadResults.recv()
    for word, count in subCounts:
      counts.inc(word, count)

  # All results have arrived: wait for the workers to exit, then release the
  # channels.
  joinThreads(threads)
  progress.close()
  threadResults.close()

  # `save` stops at the first low count, so it needs the table sorted.
  counts.sort()

  let
    dPath = joinPath(config.dstPath, "dictionary.txt")
    cPath = joinPath(config.dstPath, "counts.txt")
  save(counts, dPath, cPath)

  echo "Done. Finished in ", (getMonoTime() - start).inMilliseconds.float / 1000, " seconds."
|