Source code for multi_threading_file_compression

#!/usr/bin/env python3

import sys
import os
from pymzml.utils.utils import index_gzip
import pymzml
import glob
import multiprocessing


def main(folder, num_cpus=1):
    """
    Creates indexed gzip mzML files from all mzML files in the given folder
    using a given number of threads.

    Usage:
        python multi_threading_file_compression.py <folder> <threads>

    Note:
        If the number of threads is larger than the number of actual possible
        threads, all possible threads will be used.

    Args:
        folder (str): directory that is searched (non-recursively) for
            ``*.mzML`` files.
        num_cpus (int or str): requested number of workers; strings are
            accepted because the value may come straight from ``sys.argv``.
            Values below 1 are raised to 1.

    Returns:
        None
    """
    # Workers are processes of a multiprocessing.Pool; clamp the request to
    # the range [1, cpu_count] so that Pool() never receives an invalid size.
    max_cpus = multiprocessing.cpu_count()
    num_cpus = max(1, min(int(num_cpus), max_cpus))

    mzml_job_list = []
    for mzml_path in sorted(glob.glob(os.path.join(folder, "*.mzML"))):
        out_path = "{0}.gz".format(mzml_path)
        # Never overwrite an already existing compressed file.
        if os.path.exists(out_path):
            print("Skipping: {0}".format(mzml_path))
            continue
        mzml_job_list.append((mzml_path, out_path))

    print(
        "Compressing {0} mzML files using {1} threads".format(
            len(mzml_job_list), num_cpus
        )
    )

    # Only start worker processes when there is something to do. The context
    # manager releases the pool even if one of the jobs raises; starmap blocks
    # until every job has finished, so nothing is cut short on exit.
    if mzml_job_list:
        with multiprocessing.Pool(num_cpus) as mp_pool:
            mp_pool.starmap(compress_file, mzml_job_list)

    print("Done")
    return
def compress_file(file_path, out_path):
    """
    Write an indexed gzip copy of a single mzML file.

    The spectrum count is read with ``pymzml.run.Reader`` and the actual
    compression is delegated to ``pymzml.utils.utils.index_gzip``.

    Args:
        file_path (str): path of the uncompressed mzML file.
        out_path (str): path of the indexed gzip file to create.

    Returns:
        None
    """
    print("Working on file {0}".format(file_path))

    # Size of the input in bytes. os.path.getsize is used instead of
    # seek()/tell() on a text-mode handle, whose tell() value is an opaque
    # cookie and not guaranteed to be a byte count.
    max_offset_len = os.path.getsize(file_path)

    # NOTE(review): the +10 looks like headroom on top of the spectrum count
    # for additional index entries - confirm against index_gzip.
    max_spec_no = pymzml.run.Reader(file_path).get_spectrum_count() + 10

    # idx_len is the number of decimal digits of the file size.
    index_gzip(
        file_path, out_path, max_idx=max_spec_no, idx_len=len(str(max_offset_len))
    )
    print("Wrote file {0}".format(out_path))
    return


if __name__ == "__main__":
    if len(sys.argv) < 2:
        # Without a folder argument just show the usage text.
        print(main.__doc__)
        # sys.exit is always available, unlike the site-provided exit().
        sys.exit()
    else:
        main(*sys.argv[1:])