initial commit

This commit is contained in:
2021-07-26 06:48:01 -07:00
commit b388552a27
9 changed files with 128411 additions and 0 deletions

72
src/process.nim Normal file
View File

@ -0,0 +1,72 @@
import std/[os, cpuinfo, times, monotimes]
import std/[streams, strutils, tables]
import scanxml
proc save(wordCounts: CountTable; dictName, countName: string) =
  ## Writes every word that occurs at least 3 times to two text files.
  ##
  ## * `dictName` receives one word per line.
  ## * `countName` receives one `"<count> <word>"` entry per line.
  ##
  ## Iteration stops at the first entry whose count is below 3, so
  ## `wordCounts` must already be sorted in descending order (as done by
  ## `CountTable.sort()`); entries after a low-count one are never written.
  ##
  ## Both files are created/truncated. `openFileStream` raises `IOError`
  ## when a file cannot be opened.
  let dictFile = openFileStream(dictName, fmWrite)
  # Close on every exit path so buffered output is flushed to disk and the
  # handle is released before the caller continues.
  defer: dictFile.close()
  let countFile = openFileStream(countName, fmWrite)
  defer: countFile.close()
  for word, count in wordCounts:
    if count < 3:
      # Sorted input: everything from here on is below the threshold too.
      break
    dictFile.writeLine(word)
    countFile.writeLine($count & " " & word)
# Channels used by the worker threads to report back to the main thread.
# `threadResults` carries one finished word-count table per message;
# `progress` carries integer progress increments.
var threadResults: Channel[CountTable[string]]
var progress: Channel[int]

# A channel has to be opened before anything can be sent or received on it.
open(threadResults)
open(progress)
proc processFiles(filenames: seq[string]) {.thread.} =
  ## Thread entry point: counts the words of every file in `filenames`.
  ##
  ## For each file, every item yielded by `iterWords` (not defined in this
  ## file; presumably provided by the imported `scanxml` module) is tallied
  ## in a local `CountTable`. After each file a `1` is sent on the
  ## module-level `progress` channel, and once all files are done the
  ## complete table is sent on `threadResults`.
  ##
  ## Marked `{.thread.}` so the compiler checks GC-safety here, at the
  ## definition, rather than only where the proc is handed to `createThread`.
  var counts: CountTable[string]
  for file in filenames:
    for word in iterWords(file):
      counts.inc(word)
    # One tick per finished file; the receiver expects exactly one message
    # for every path that was handed out.
    progress.send(1)
  # Exactly one result message per worker, even when `filenames` is empty.
  threadResults.send(counts)
when isMainModule:
  # Driver: collect the .xml files below `basePath`, count their words on
  # several threads, merge the per-thread tables and write the result to
  # "dictionary.txt" and "counts.txt".
  let start = getMonoTime()
  let basePath = r"../BNC/2554/download/Texts/"

  var paths: seq[string]
  for path in walkDirRec(basePath):
    if path.endsWith(".xml"):
      paths.add(path)

  # countProcessors() returns 0 when the core count cannot be detected.
  # Without the clamp no worker would be started and the progress loop
  # below would wait forever for messages that nobody sends.
  let numThreads = max(1, countProcessors())
  var threads = newSeq[Thread[seq[string]]](numThreads)

  # Hand out contiguous slices of `paths`; the first `len mod numThreads`
  # workers take one extra file so every path is assigned exactly once.
  var lastIdx = 0
  for i, t in threads.mpairs:
    var chunksize = paths.len div numThreads
    if i < paths.len mod numThreads:
      chunksize += 1
    let newIdx = lastIdx + chunksize
    let chunk = paths[lastIdx ..< newIdx]
    lastIdx = newIdx
    createThread(t, processFiles, chunk)

  # Each worker sends one progress message per processed file, so exactly
  # `paths.len` messages are expected in total.
  var processed = 0
  for i in 0 .. paths.high:
    processed += progress.recv()
    stdout.write("Processing files: " & $processed & "\r")
    stdout.flushFile
  echo ""

  # One result table per worker; merge them into the global tally.
  var counts: CountTable[string]
  for i in 0 .. threads.high:
    let subCounts = threadResults.recv()
    for word, count in subCounts:
      counts.inc(word, count)

  # Every worker has delivered its result by now; wait for the threads to
  # terminate and release the channel resources.
  joinThreads(threads)
  progress.close()
  threadResults.close()

  # `save` relies on descending order to stop at the first rare word.
  counts.sort()
  save(counts, "dictionary.txt", "counts.txt")
  echo "Done. Finished in ", (getMonoTime() - start).inMilliseconds.float / 1000, " seconds."