package edu.umd.marbl.mhap.main;

import edu.umd.marbl.mhap.impl.MhapRuntimeException;
import edu.umd.marbl.mhap.impl.MinHashSearch;
import edu.umd.marbl.mhap.impl.SequenceId;
import edu.umd.marbl.mhap.impl.SequenceSketchStreamer;
import edu.umd.marbl.mhap.sketch.FrequencyCounts;
import edu.umd.marbl.mhap.utils.ParseOptions;
import edu.umd.marbl.mhap.utils.Utils;
import java.io.BufferedReader;
import java.io.File;
import java.io.FilenameFilter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;

/* loaded from: input_file:edu/umd/marbl/mhap/main/MhapMain.class */
public final class MhapMain {
    private final double acceptScore;
    private final String inFile;
    private final FrequencyCounts kmerFilter;
    private final int kmerSize;
    private final double maxShift;
    private final int minStoreLength;
    private final int minOlapLength;
    private final boolean noSelf;
    private final int numHashes;
    private final int numMinMatches;
    protected final int numThreads;
    private final int orderedKmerSize;
    private final int orderedSketchSize;
    private final String processFile;
    private final String toFile;
    private final double repeatWeight;
    private final boolean doReverseCompliment;
    private static final double DEFAULT_OVERLAP_ACCEPT_SCORE = 0.78d;
    private static final double DEFAULT_REPEAT_WEIGHT = 0.9d;
    private static final double DEFAULT_REPEAT_IDF_SCALE = 3.0d;
    private static final double DEFAULT_FILTER_CUTOFF = 1.0E-5d;
    private static final int DEFAULT_KMER_SIZE = 16;
    private static final double DEFAULT_MAX_SHIFT_PERCENT = 0.2d;
    private static final int DEFAULT_MIN_STORE_LENGTH = 0;
    private static final int DEFAULT_MIN_OVL_LENGTH = 116;
    private static final int DEFAULT_NUM_MIN_MATCHES = 3;
    private static final int DEFAULT_NUM_THREADS = Runtime.getRuntime().availableProcessors();
    private static final int DEFAULT_NUM_WORDS = 512;
    private static final int DEFAULT_ORDERED_KMER_SIZE = 12;
    private static final int DEFAULT_ORDERED_SKETCH_SIZE = 1536;

    public static void main(String[] strArr) throws Exception {
        Locale.setDefault(Locale.US);
        ParseOptions parseOptions = new ParseOptions();
        parseOptions.addStartTextLine("MHAP: MinHash Alignment Protocol. A tool for finding overlaps of long-read sequences (such as PacBio or Nanopore) in bioinformatics.");
        parseOptions.addStartTextLine("\tVersion: " + MhapMain.class.getPackage().getImplementationVersion());
        parseOptions.addStartTextLine("\tUsage 1 (direct execution): java -server -Xmx<memory> -jar <MHAP jar> -s<fasta/dat from/self file> [-q<fasta/dat to file>] [-f<kmer filter list, must be sorted>]");
        parseOptions.addStartTextLine("\tUsage 2 (generate precomputed binaries): java -server -Xmx<memory> -jar <MHAP jar> -p<directory of fasta files> -q <output directory> [-f<kmer filter list, must be sorted>]");
        parseOptions.addOption("-s", "Usage 1 only. The FASTA or binary dat file (see Usage 2) of reads that will be stored in a box, and that all subsequent reads will be compared to.", "");
        parseOptions.addOption("-q", "Usage 1: The FASTA file of reads, or a directory of files, that will be compared to the set of reads in the box (see -s). Usage 2: The output directory for the binary formatted dat files.", "");
        parseOptions.addOption("-p", "Usage 2 only. The directory containing FASTA files that should be converted to binary format for storage.", "");
        parseOptions.addOption("-f", "k-mer filter file used for filtering out highly repetative k-mers. Must be sorted in descending order of frequency (second column).", "");
        parseOptions.addOption("-k", "[int], k-mer size used for MinHashing. The k-mer size for second stage filter is seperate, and can also be modified.", 16);
        parseOptions.addOption("--num-hashes", "[int], Number of min-mers to be used in MinHashing.", Integer.valueOf(DEFAULT_NUM_WORDS));
        parseOptions.addOption("--threshold", "[double], The threshold cutoff for the second stage sort-merge filter. This is based on the identity score computed from the Jaccard distance of k-mers (size given by ordered-kmer-size) in the overlapping regions.", Double.valueOf(DEFAULT_OVERLAP_ACCEPT_SCORE));
        parseOptions.addOption("--filter-threshold", "[double], The cutoff at which the k-mer in the k-mer filter file is considered repetitive. This value for a specific k-mer is specified in the second column in the filter file. If no filter file is provided, this option is ignored.", Double.valueOf(DEFAULT_FILTER_CUTOFF));
        parseOptions.addOption("--max-shift", "[double], Region size to the left and right of the estimated overlap, as derived from the median shift and sequence length, where a k-mer matches are still considered valid. Second stage filter only.", Double.valueOf(DEFAULT_MAX_SHIFT_PERCENT));
        parseOptions.addOption("--num-min-matches", "[int], Minimum # min-mer that must be shared before computing second stage filter. Any sequences below that value are considered non-overlapping.", Integer.valueOf(DEFAULT_NUM_MIN_MATCHES));
        parseOptions.addOption("--num-threads", "[int], nNumber of threads to use for computation. Typically set to #cores.", Integer.valueOf(DEFAULT_NUM_THREADS));
        parseOptions.addOption("--repeat-weight", "[double] Repeat suppression strength for tf-idf weighing. <0.0 do unweighted MinHash (version 1.0), >=1.0 do only the tf weighing. To perform no idf weighting, do no supply -f option. ", Double.valueOf(DEFAULT_REPEAT_WEIGHT));
        parseOptions.addOption("--repeat-idf-scale", "[double] The upper range of the idf (from tf-idf) scale. The full scale will be [1,X], where X is the parameter.", Double.valueOf(DEFAULT_REPEAT_IDF_SCALE));
        parseOptions.addOption("--ordered-kmer-size", "[int] The size of k-mers used in the ordered second stage filter.", Integer.valueOf(DEFAULT_ORDERED_KMER_SIZE));
        parseOptions.addOption("--ordered-sketch-size", "[int] The sketch size for second stage filter.", Integer.valueOf(DEFAULT_ORDERED_SKETCH_SIZE));
        parseOptions.addOption("--min-store-length", "[int], The minimum length of the read that is stored in the box. Used to filter out short reads from FASTA file.", 0);
        parseOptions.addOption("--min-olap-length", "[int], The minimum length of the read that used for overlapping. Used to filter out short reads from FASTA file.", Integer.valueOf(DEFAULT_MIN_OVL_LENGTH));
        parseOptions.addOption("--no-self", "Do not compute the overlaps between sequences inside a box. Should be used when the to and from sequences are coming from different files.", false);
        parseOptions.addOption("--store-full-id", "Store full IDs as seen in FASTA files, rather than storing just the sequence position in the file. Some FASTA files have long IDS, slowing output of results. This options is ignored when using compressed file format. Indexed file (-s) is indexed first, followed by -q files in alphabetical order.", false);
        parseOptions.addOption("--supress-noise", "[int] 0) Does nothing, 1) completely removes any k-mers not specified in the filter file, 2) supresses k-mers not specified in the filter file, similar to repeats. ", 0);
        parseOptions.addOption("--no-tf", "Do not perform the tf weighing, in the tf-idf weighing.", false);
        parseOptions.addOption("--no-rc", "Do not store or do comparison of the reverse compliment strings.", false);
        parseOptions.addOption("--settings", "Set all unset parameters for the default settings. Same defaults are applied to Nanopore and Pacbio reads. 0) None, 1) Default, 2) Fast, 3) Sensitive.", 0);
        if (!parseOptions.process(strArr)) {
            System.exit(0);
        }
        if (parseOptions.get("--settings").getInteger() < 0 || parseOptions.get("--settings").getInteger() > DEFAULT_NUM_MIN_MATCHES) {
            System.out.println("Please enter valid --settings flag. See options below:");
            System.out.println(parseOptions.helpMenuString());
            System.exit(1);
        }
        if (parseOptions.get("--settings").getInteger() == 1) {
            if (!parseOptions.get("-k").isSet()) {
                parseOptions.setOptions("-k", 16);
            }
            if (!parseOptions.get("--num-min-matches").isSet()) {
                parseOptions.setOptions("--num-min-matches", Integer.valueOf(DEFAULT_NUM_MIN_MATCHES));
            }
            if (!parseOptions.get("--num-hashes").isSet()) {
                parseOptions.setOptions("--num-hashes", Integer.valueOf(DEFAULT_NUM_WORDS));
            }
            if (!parseOptions.get("--threshold").isSet()) {
                parseOptions.setOptions("--threshold", Double.valueOf(DEFAULT_OVERLAP_ACCEPT_SCORE));
            }
            if (!parseOptions.get("--ordered-sketch-size").isSet()) {
                parseOptions.setOptions("--ordered-sketch-size", Integer.valueOf(DEFAULT_ORDERED_SKETCH_SIZE));
            }
            if (!parseOptions.get("--ordered-kmer-size").isSet()) {
                parseOptions.setOptions("--ordered-kmer-size", Integer.valueOf(DEFAULT_ORDERED_KMER_SIZE));
            }
        } else if (parseOptions.get("--settings").getInteger() == 2) {
            if (!parseOptions.get("-k").isSet()) {
                parseOptions.setOptions("-k", 16);
            }
            if (!parseOptions.get("--num-min-matches").isSet()) {
                parseOptions.setOptions("--num-min-matches", Integer.valueOf(DEFAULT_NUM_MIN_MATCHES));
            }
            if (!parseOptions.get("--num-hashes").isSet()) {
                parseOptions.setOptions("--num-hashes", 256);
            }
            if (!parseOptions.get("--threshold").isSet()) {
                parseOptions.setOptions("--threshold", Double.valueOf(0.8d));
            }
            if (!parseOptions.get("--ordered-sketch-size").isSet()) {
                parseOptions.setOptions("--ordered-sketch-size", 1000);
            }
            if (!parseOptions.get("--ordered-kmer-size").isSet()) {
                parseOptions.setOptions("--ordered-kmer-size", 14);
            }
        } else if (parseOptions.get("--settings").getInteger() == DEFAULT_NUM_MIN_MATCHES) {
            if (!parseOptions.get("-k").isSet()) {
                parseOptions.setOptions("-k", 16);
            }
            if (!parseOptions.get("--num-min-matches").isSet()) {
                parseOptions.setOptions("--num-min-matches", 2);
            }
            if (!parseOptions.get("--num-hashes").isSet()) {
                parseOptions.setOptions("--num-hashes", 768);
            }
            if (!parseOptions.get("--threshold").isSet()) {
                parseOptions.setOptions("--threshold", Double.valueOf(0.73d));
            }
            if (!parseOptions.get("--ordered-sketch-size").isSet()) {
                parseOptions.setOptions("--ordered-sketch-size", Integer.valueOf(DEFAULT_ORDERED_SKETCH_SIZE));
            }
            if (!parseOptions.get("--ordered-kmer-size").isSet()) {
                parseOptions.setOptions("--ordered-kmer-size", Integer.valueOf(DEFAULT_ORDERED_KMER_SIZE));
            }
        }
        if (parseOptions.get("-s").getString().isEmpty() && parseOptions.get("-p").getString().isEmpty()) {
            System.out.println("Please set the -s or the -p options. See options below:");
            System.out.println(parseOptions.helpMenuString());
            System.exit(1);
        }
        if (!parseOptions.get("-p").getString().isEmpty() && parseOptions.get("-q").getString().isEmpty()) {
            System.out.println("Please set the -q option. See options below:");
            System.out.println(parseOptions.helpMenuString());
            System.exit(1);
        }
        if (!parseOptions.get("-p").getString().isEmpty() && !new File(parseOptions.get("-p").getString()).exists()) {
            System.out.println("Could not find requested file/folder: " + parseOptions.get("-p").getString());
            System.exit(1);
        }
        if (!parseOptions.get("-s").getString().isEmpty() && !new File(parseOptions.get("-s").getString()).exists()) {
            System.out.println("Could not find requested file/folder: " + parseOptions.get("-s").getString());
            System.exit(1);
        }
        if (!parseOptions.get("-q").getString().isEmpty() && !new File(parseOptions.get("-q").getString()).exists()) {
            System.out.println("Could not find requested file/folder: " + parseOptions.get("-q").getString());
            System.exit(1);
        }
        if (!parseOptions.get("-f").getString().isEmpty() && !new File(parseOptions.get("-f").getString()).exists()) {
            System.out.println("Could not find requested file/folder: " + parseOptions.get("-f").getString());
            System.exit(1);
        }
        if (parseOptions.get("--num-threads").getInteger() <= 0) {
            System.out.println("Number of threads must be positive.");
            System.exit(1);
        }
        if (parseOptions.get("-k").getInteger() <= 0) {
            System.out.println("k-mer size must be positive.");
            System.exit(1);
        }
        if (parseOptions.get("--num-min-matches").getInteger() <= 0) {
            System.out.println("Minimum number of matches must be positive.");
            System.exit(1);
        }
        if (parseOptions.get("--min-store-length").getInteger() < 0) {
            System.out.println("The minimum read length stored must be >=0.");
            System.exit(1);
        }
        if (parseOptions.get("--repeat-idf-scale").getDouble() < 1.0d) {
            System.out.println("The minimum repeat idf scale must be >=1.0.");
            System.exit(1);
        }
        if (parseOptions.get("--max-shift").getDouble() < -1.0d) {
            System.out.println("The minimum shift must be greater than -1.");
            System.exit(1);
        }
        if (parseOptions.get("--threshold").getDouble() < 0.0d || parseOptions.get("--threshold").getDouble() > 1.0d) {
            System.out.println("The second stage filter threshold must be 0<=threshold<=1.0.");
            System.exit(1);
        }
        if (parseOptions.get("--supress-noise").getInteger() < 0 || parseOptions.get("--supress-noise").getInteger() > 2) {
            System.out.println("The --supress-noise parameter must be in [0,2].");
            System.exit(1);
        }
        if (parseOptions.get("--store-full-id").getBoolean()) {
            SequenceId.STORE_FULL_ID = true;
        } else {
            SequenceId.STORE_FULL_ID = false;
        }
        System.err.println("Running with these settings:");
        System.err.println(parseOptions);
        new MhapMain(parseOptions).computeMain();
    }

    public MhapMain(ParseOptions parseOptions) throws IOException {
        this.processFile = parseOptions.get("-p").getString();
        this.inFile = parseOptions.get("-s").getString();
        this.toFile = parseOptions.get("-q").getString();
        this.noSelf = parseOptions.get("--no-self").getBoolean();
        this.numThreads = parseOptions.get("--num-threads").getInteger();
        this.numHashes = parseOptions.get("--num-hashes").getInteger();
        this.kmerSize = parseOptions.get("-k").getInteger();
        this.numMinMatches = parseOptions.get("--num-min-matches").getInteger();
        this.minStoreLength = parseOptions.get("--min-store-length").getInteger();
        this.minOlapLength = parseOptions.get("--min-olap-length").getInteger();
        this.maxShift = parseOptions.get("--max-shift").getDouble();
        this.acceptScore = parseOptions.get("--threshold").getDouble();
        this.repeatWeight = parseOptions.get("--repeat-weight").getDouble();
        this.orderedKmerSize = parseOptions.get("--ordered-kmer-size").getInteger();
        this.orderedSketchSize = parseOptions.get("--ordered-sketch-size").getInteger();
        this.doReverseCompliment = !parseOptions.get("--no-rc").getBoolean();
        String string = parseOptions.get("-f").getString();
        if (string.isEmpty()) {
            this.kmerFilter = null;
            return;
        }
        long nanoTime = System.nanoTime();
        System.err.println("Reading in filter file " + string + ".");
        try {
            double d = 0.0d;
            if (this.repeatWeight >= 0.0d && this.repeatWeight < 1.0d) {
                d = this.repeatWeight;
            }
            double d2 = parseOptions.get("--filter-threshold").getDouble();
            int integer = parseOptions.get("--supress-noise").getInteger();
            boolean z = parseOptions.get("--no-tf").getBoolean();
            double d3 = parseOptions.get("--repeat-idf-scale").getDouble();
            BufferedReader file = Utils.getFile(string, null);
            Throwable th = null;
            try {
                try {
                    this.kmerFilter = new FrequencyCounts(file, d2, d, integer, z, this.numThreads, d3, this.doReverseCompliment);
                    if (file != null) {
                        if (0 != 0) {
                            try {
                                file.close();
                            } catch (Throwable th2) {
                                th.addSuppressed(th2);
                            }
                        } else {
                            file.close();
                        }
                    }
                    System.err.println("Time (s) to read filter file: " + ((System.nanoTime() - nanoTime) * 1.0E-9d));
                    if (this.kmerFilter != null) {
                        System.err.println("Read in k-mer filter for sizes: " + this.kmerFilter.getKmerSizes());
                    }
                } finally {
                }
            } catch (Throwable th3) {
                th = th3;
                throw th3;
            }
        } catch (Exception e) {
            throw new MhapRuntimeException("Could not parse k-mer filter file.", e);
        }
    }

    public void computeMain() throws IOException {
        long nanoTime = System.nanoTime();
        System.nanoTime();
        long nanoTime2 = System.nanoTime();
        if (this.processFile != null && !this.processFile.isEmpty()) {
            System.err.println("Processing FASTA files for binary compression...");
            File file = new File(this.processFile);
            if (!file.exists()) {
                throw new MhapRuntimeException("Process file does not exist.");
            }
            if (this.toFile == null || this.toFile.isEmpty()) {
                throw new MhapRuntimeException("Target directory must be defined.");
            }
            File file2 = new File(this.toFile);
            if (!file2.exists() || !file2.isDirectory()) {
                throw new MhapRuntimeException("Target directory doesn't exit.");
            }
            ArrayList arrayList = new ArrayList();
            if (file.isDirectory()) {
                File[] listFiles = file.listFiles((file3, str) -> {
                    return !str.startsWith(".");
                });
                if (listFiles != null) {
                    for (File file4 : listFiles) {
                        arrayList.add(file4);
                    }
                }
                Collections.sort(arrayList);
            } else {
                arrayList.add(file);
            }
            Iterator it2 = arrayList.iterator();
            while (it2.hasNext()) {
                File file5 = (File) it2.next();
                long nanoTime3 = System.nanoTime();
                SequenceSketchStreamer sequenceHashStreamer = getSequenceHashStreamer(file5.getAbsolutePath(), 0);
                String name = file5.getName();
                int lastIndexOf = name.lastIndexOf(46);
                if (lastIndexOf > 0) {
                    name = name.substring(0, lastIndexOf);
                }
                String str2 = file2.getPath() + File.separator + name + ".dat";
                sequenceHashStreamer.writeToBinary(str2, false, this.numThreads);
                System.err.println("Processed " + sequenceHashStreamer.getNumberProcessed() + " sequences (fwd and rev).");
                System.err.println("Read, hashed, and stored file " + file5.getPath() + " to " + str2 + ".");
                System.err.println("Time (s): " + ((System.nanoTime() - nanoTime3) * 1.0E-9d));
            }
            System.err.println("Total time (s): " + ((System.nanoTime() - nanoTime) * 1.0E-9d));
            return;
        }
        System.err.println("Processing files for storage in reverse index...");
        SequenceSketchStreamer sequenceHashStreamer2 = getSequenceHashStreamer(this.inFile, 0);
        MinHashSearch matchSearch = getMatchSearch(sequenceHashStreamer2);
        int numberProcessed = 0 + (sequenceHashStreamer2.getNumberProcessed() / 2);
        System.err.println("Processed " + sequenceHashStreamer2.getNumberProcessed() + " unique sequences (fwd and rev).");
        System.err.println("Time (s) to read and hash from file: " + ((System.nanoTime() - nanoTime2) * 1.0E-9d));
        long nanoTime4 = System.nanoTime();
        if (this.toFile == null || this.toFile.isEmpty()) {
            long nanoTime5 = System.nanoTime();
            matchSearch.findMatches();
            System.err.println("Time (s) to score and output to self: " + ((System.nanoTime() - nanoTime5) * 1.0E-9d));
        } else {
            File file6 = new File(this.toFile);
            if (!file6.exists()) {
                throw new MhapRuntimeException("To-file does not exist.");
            }
            ArrayList arrayList2 = new ArrayList();
            if (file6.isDirectory()) {
                for (File file7 : file6.listFiles(new FilenameFilter() { // from class: edu.umd.marbl.mhap.main.MhapMain.1
                    @Override // java.io.FilenameFilter
                    public boolean accept(File file8, String str3) {
                        return !str3.startsWith(".");
                    }
                })) {
                    arrayList2.add(file7);
                }
            } else {
                arrayList2.add(file6);
            }
            Collections.sort(arrayList2);
            long nanoTime6 = System.nanoTime();
            if (!this.noSelf) {
                matchSearch.findMatches();
                System.out.flush();
                System.err.println("Time (s) to score and output to self: " + ((System.nanoTime() - nanoTime6) * 1.0E-9d));
            }
            Iterator it3 = arrayList2.iterator();
            while (it3.hasNext()) {
                File file8 = (File) it3.next();
                SequenceSketchStreamer sequenceHashStreamer3 = getSequenceHashStreamer(file8.getAbsolutePath(), numberProcessed);
                System.err.println("Opened fasta file " + file8.getCanonicalPath() + ".");
                long nanoTime7 = System.nanoTime();
                matchSearch.findMatches(sequenceHashStreamer3);
                System.out.flush();
                numberProcessed += sequenceHashStreamer3.getNumberProcessed();
                System.err.println("Processed " + sequenceHashStreamer3.getNumberProcessed() + " to sequences.");
                System.err.println("Time (s) to score, hash to-file, and output: " + ((System.nanoTime() - nanoTime7) * 1.0E-9d));
            }
        }
        System.out.flush();
        System.err.println("Total scoring time (s): " + ((System.nanoTime() - nanoTime4) * 1.0E-9d));
        System.err.println("Total time (s): " + ((System.nanoTime() - nanoTime) * 1.0E-9d));
        outputFinalStat(matchSearch);
    }

    public MinHashSearch getMatchSearch(SequenceSketchStreamer sequenceSketchStreamer) throws IOException {
        return new MinHashSearch(sequenceSketchStreamer, this.numHashes, this.numMinMatches, this.numThreads, false, this.minStoreLength, this.maxShift, this.acceptScore, this.doReverseCompliment);
    }

    public SequenceSketchStreamer getSequenceHashStreamer(String str, int i) throws IOException {
        return str.endsWith(".dat") ? new SequenceSketchStreamer(str, this.minOlapLength, i) : new SequenceSketchStreamer(str, this.minOlapLength, this.kmerSize, this.numHashes, this.orderedKmerSize, this.orderedSketchSize, this.kmerFilter, this.doReverseCompliment, this.repeatWeight, i);
    }

    protected void outputFinalStat(MinHashSearch minHashSearch) {
        System.err.println("MinHash search time (s): " + minHashSearch.getMinHashSearchTime());
        System.err.println("Total matches found: " + minHashSearch.getMatchesProcessed());
        System.err.println("Average number of matches per lookup: " + (minHashSearch.getMatchesProcessed() / minHashSearch.getNumberSequencesSearched()));
        System.err.println("Average number of table elements processed per lookup: " + (minHashSearch.getNumberElementsProcessed() / minHashSearch.getNumberSequencesSearched()));
        System.err.println("Average number of table elements processed per match: " + (minHashSearch.getNumberElementsProcessed() / minHashSearch.getMatchesProcessed()));
        System.err.println("Average % of hashed sequences hit per lookup: " + ((minHashSearch.getNumberSequencesHit() / (minHashSearch.size() * minHashSearch.getNumberSequencesSearched())) * 100.0d));
        System.err.println("Average % of hashed sequences hit that are matches: " + ((minHashSearch.getMatchesProcessed() / minHashSearch.getNumberSequencesHit()) * 100.0d));
        System.err.println("Average % of hashed sequences fully compared that are matches: " + ((minHashSearch.getMatchesProcessed() / minHashSearch.getNumberSequencesFullyCompared()) * 100.0d));
        System.err.flush();
    }
}
