import std/[os, cpuinfo, times, monotimes]
import std/[streams, sequtils, strutils, tables]
import scanxml

iterator ichunkate(items: seq, numChunks: Natural): (int, seq) =
  ## Splits `items` into `numChunks` contiguous slices of near-equal size
  ## and yields each slice together with its chunk index.
  ## The first `items.len mod numChunks` chunks receive one extra element,
  ## so all of `items` is covered exactly once.
  var chunkStart = 0
  for i in 0 ..< numChunks:
    # Spread the division remainder over the leading chunks.
    let chunkLen =
      if i < items.len mod numChunks: (items.len div numChunks) + 1
      else: items.len div numChunks
    let chunkEnd = chunkStart + chunkLen
    yield (i, items[chunkStart ..< chunkEnd])
    chunkStart = chunkEnd

proc save(wordCounts: CountTable; dictName, countName: string) =
  ## Writes every word whose count is >= 3 to `dictName` (one word per
  ## line) and matching "<count> <word>" lines to `countName`.
  ## NOTE: relies on `wordCounts` being sorted in descending count order
  ## (the caller runs `counts.sort()` first), so iteration stops at the
  ## first entry with a count below 3.
  let dictFile = openFileStream(dictName, fmWrite)
  defer: dictFile.close()  # fix: streams were previously never closed
  let countFile = openFileStream(countName, fmWrite)
  defer: countFile.close()
  for word, count in wordCounts:
    if count >= 3:
      dictFile.writeLine(word)
      countFile.writeLine($count & " " & word)
    else:
      break

var
  threadResults: Channel[CountTable[string]]  # one per-thread word-count table
  progress: Channel[int]                      # one message per processed file
threadResults.open()
progress.open()

proc processFiles(filenames: seq[string]) {.thread.} =
  ## Thread worker: counts every word (via scanxml's `iterWords`) across
  ## `filenames`, reporting one progress tick per file, then sends the
  ## accumulated table back over `threadResults`.
  ## `{.thread.}` added: createThread expects a thread-entry proc type.
  var counts: CountTable[string]
  for file in filenames:
    for word in iterWords(file):
      counts.inc(word)
    progress.send(1)
  threadResults.send(counts)

when isMainModule:
  let start = getMonoTime()
  # Corpus root: first CLI argument, or the default BNC download path.
  let basePath =
    if paramCount() > 0: paramStr(1)
    else: r"../BNC/2554/download/Texts/"
  var paths: seq[string]
  for path in walkDirRec(basePath):
    if path.endsWith(".xml"):
      paths.add(path)
  # One thread per logical CPU; each gets a near-equal slice of the files.
  let numThreads = countProcessors()
  var threads = newSeq[Thread[seq[string]]](numThreads)
  for i, chunk in paths.ichunkate(numThreads):
    createThread(threads[i], processFiles, chunk)
  # Exactly one progress message arrives per file, so this drains cleanly.
  var processed = 0
  for i in 0 .. paths.high:
    processed += progress.recv()
    stdout.write("Processing files: " & $processed & "\r")
    stdout.flushFile
  echo ""
  # Merge the per-thread tables into one global count table.
  var counts: CountTable[string]
  for i in 0 .. threads.high:
    let subCounts = threadResults.recv()
    for word, count in subCounts:
      counts.inc(word, count)
  counts.sort()  # descending by count — required by `save`'s early break
  save(counts, "dictionary.txt", "counts.txt")
  echo "Done. Finished in ", (getMonoTime() - start).inMilliseconds.float / 1000, " seconds."