initial commit
This commit is contained in:
72
src/process.nim
Normal file
72
src/process.nim
Normal file
@ -0,0 +1,72 @@
|
||||
import std/[os, cpuinfo, times, monotimes]
|
||||
import std/[streams, strutils, tables]
|
||||
import scanxml
|
||||
|
||||
|
||||
proc save(wordCounts: CountTable; dictName, countName: string;
          minCount = 3) =
  ## Writes the frequent words of `wordCounts` to two text files.
  ##
  ## * `dictName` receives one word per line.
  ## * `countName` receives one `<count> <word>` pair per line.
  ## * `minCount` is the smallest count a word needs to be written
  ##   (defaults to 3).
  ##
  ## `wordCounts` must already be sorted by descending count: iteration stops
  ## at the first entry whose count is below `minCount`, so any entry after it
  ## is never written.
  let dictFile = openFileStream(dictName, fmWrite)
  # Close on every exit path so buffered output is flushed and the handle is
  # released, even if opening the second file or a write fails.
  defer: dictFile.close()
  let countFile = openFileStream(countName, fmWrite)
  defer: countFile.close()

  for word, count in wordCounts:
    if count < minCount:
      # Sorted input: everything from here on is below the threshold too.
      break
    dictFile.writeLine(word)
    countFile.writeLine($count & " " & word)
|
||||
|
||||
|
||||
# Channel on which processFiles sends its finished CountTable; the main
# module receives one table per worker thread from it.
var threadResults: Channel[CountTable[string]]
open(threadResults)

# Channel on which processFiles sends a 1 after each file it has processed;
# the main module sums these to print progress.
var progress: Channel[int]
open(progress)
|
||||
|
||||
|
||||
proc processFiles(filenames: seq[string]) {.thread.} =
  ## Thread entry point: counts the words of every file in `filenames`.
  ##
  ## Sends `1` on the `progress` channel after each file, and sends the
  ## accumulated table on the `threadResults` channel exactly once, after the
  ## last file. An empty `filenames` still sends one (empty) table.
  ##
  ## Marked `{.thread.}` so the compiler checks GC-safety at the definition
  ## instead of only at the `createThread` call site.
  var counts: CountTable[string]
  for file in filenames:
    for word in iterWords(file):
      counts.inc(word)
    # One message per file, whether or not it contained any words.
    progress.send(1)
  threadResults.send(counts)
|
||||
|
||||
|
||||
when isMainModule:
  # Counts word occurrences in every .xml file below `basePath`, splitting the
  # files across worker threads, then writes the merged, sorted result to
  # dictionary.txt and counts.txt and prints the elapsed time.
  let start = getMonoTime()
  let basePath = r"../BNC/2554/download/Texts/"

  var paths: seq[string]
  for path in walkDirRec(basePath):
    if path.endsWith(".xml"):
      paths.add(path)

  # countProcessors() returns 0 when the CPU count cannot be detected. With no
  # workers nothing would ever be sent on `progress` and the loop below would
  # block forever, so always run at least one worker.
  let numThreads = max(1, countProcessors())
  var threads = newSeq[Thread[seq[string]]](numThreads)
  var lastIdx = 0
  for i, t in threads.mpairs:
    # Even split; the first `paths.len mod numThreads` workers take one extra
    # file so every path is assigned exactly once.
    var chunksize = paths.len div numThreads
    if i < paths.len mod numThreads:
      chunksize += 1

    let newIdx = lastIdx + chunksize
    let chunk = paths[lastIdx ..< newIdx]
    lastIdx = newIdx

    createThread(t, processFiles, chunk)

  # Workers send one progress message per file, so wait for paths.len of them.
  var processed = 0
  for _ in 0 .. paths.high:
    processed += progress.recv()
    stdout.write("Processing files: " & $processed & "\r")
    stdout.flushFile
  echo ""

  # Each worker sends exactly one result table; merge them all.
  var counts: CountTable[string]
  for _ in 0 .. threads.high:
    let subCounts = threadResults.recv()
    for word, count in subCounts:
      counts.inc(word, count)

  # All results are in: wait for the workers to exit, then release the
  # channels.
  joinThreads(threads)
  threadResults.close()
  progress.close()

  # save() relies on descending order to stop at the first rare word.
  counts.sort()
  save(counts, "dictionary.txt", "counts.txt")

  echo "Done. Finished in ", (getMonoTime() - start).inMilliseconds.float / 1000, " seconds."
|
Reference in New Issue
Block a user