diff --git a/gortools/src/test/java/gorsat/UTestCram.java b/gortools/src/test/java/gorsat/UTestCram.java index 7d32d0cb0..9b1d20bc9 100644 --- a/gortools/src/test/java/gorsat/UTestCram.java +++ b/gortools/src/test/java/gorsat/UTestCram.java @@ -31,12 +31,17 @@ import org.junit.Assert; import org.junit.Rule; import org.junit.Test; +import org.junit.contrib.java.lang.system.RestoreSystemProperties; import org.junit.rules.TemporaryFolder; import java.io.File; import java.io.IOException; import java.nio.charset.Charset; import java.nio.file.Paths; +import java.util.List; + +import static gorsat.TestUtils.LINE_SPLIT_PATTERN; +import static org.gorpipe.gor.driver.providers.stream.datatypes.cram.CramIterator.KEY_REFERENCE_FORCE_FOLDER; public class UTestCram { @@ -45,6 +50,9 @@ public class UTestCram { @Rule public TemporaryFolder workDir = new TemporaryFolder(); + @Rule + public RestoreSystemProperties restoreSystemProperties = new RestoreSystemProperties(); + public static File createWrongConfigFile(File directory) throws IOException { return FileTestUtils.createTempFile(directory, "generic.gor", "buildPath\t../tests/data/ref_mini/chromSeq\n" + @@ -106,7 +114,7 @@ public void readCramWithFastaReferenceFromConfig() { public void readCramWithFastaReferenceFromConfigException() throws IOException { File wrongConfigFile = createWrongConfigFile(workDir.getRoot()); System.clearProperty("gor.driver.cram.fastareferencesource"); - String[] args = new String[]{ + String[] args = new String[] { "gor " + DataUtil.toFile("../tests/data/external/samtools/cram_query_sorted", DataType.CRAM), "-config", wrongConfigFile.getCanonicalPath()}; @@ -120,24 +128,29 @@ public void readCramWithFastaReferenceFromConfigException() throws IOException { @Test public void readCramWithFastaReferenceAndGenerateMissingAttributes() { - try { - System.setProperty("gor.driver.cram.fastareferencesource", DataUtil.toFile("../tests/data/external/samtools/cram_query_sorted", DataType.FASTA)); - System.setProperty("gor.driver.cram.generatemissingattributes", "false"); - String[] linesWithoutMissingAttributes = TestUtils.runGorPipeLines("gor " + DataUtil.toFile("../tests/data/external/samtools/cram_query_sorted", DataType.CRAM)); - System.setProperty("gor.driver.cram.generatemissingattributes", "true"); - String[] linesWithMissingAttributes = TestUtils.runGorPipeLines("gor " + DataUtil.toFile("../tests/data/external/samtools/cram_query_sorted", DataType.CRAM)); - - Assert.assertEquals(8, linesWithoutMissingAttributes.length); - Assert.assertEquals(8, linesWithMissingAttributes.length); - // See if we have the missing entry in the last column. - Assert.assertFalse(linesWithoutMissingAttributes[1].contains("NM=")); - Assert.assertTrue(linesWithMissingAttributes[1].contains("NM=")); + System.setProperty("gor.driver.cram.fastareferencesource", DataUtil.toFile("../tests/data/external/samtools/cram_query_sorted", DataType.FASTA)); + System.setProperty(KEY_REFERENCE_FORCE_FOLDER, "false"); - } finally { - System.clearProperty("gor.driver.cram.fastareferencesource"); - System.clearProperty("gor.driver.cram.generatemissingattributes"); - } + String[] args = new String[] {"gor " + DataUtil.toFile("../tests/data/external/samtools/cram_query_sorted", DataType.CRAM)}; + + System.setProperty("gor.driver.cram.generatemissingattributes", "false"); + String[] linesWithoutMissingAttributes = TestUtils.runGorPipe(args, false).split(LINE_SPLIT_PATTERN); + + System.setProperty("gor.driver.cram.generatemissingattributes", "true"); + String[] linesWithMissingAttributesCramRef = TestUtils.runGorPipe(args, false).split(LINE_SPLIT_PATTERN); + + args = new String[] { + "gor " + DataUtil.toFile("../tests/data/external/samtools/cram_query_sorted", DataType.CRAM) + , "-config", "../tests/data/ref_mini/gor_config.txt"}; + String[] linesWithMissingAttributesProjectRef = TestUtils.runGorPipe(args, false).split(LINE_SPLIT_PATTERN); + Assert.assertEquals(8, linesWithoutMissingAttributes.length); + Assert.assertEquals(8, linesWithMissingAttributesCramRef.length); + Assert.assertEquals(8, linesWithMissingAttributesProjectRef.length); + // See if we have the missing entry in the last column. + Assert.assertFalse(linesWithoutMissingAttributes[1].contains("NM=")); + Assert.assertTrue(linesWithMissingAttributesCramRef[1].contains("NM=")); + Assert.assertTrue(linesWithMissingAttributesProjectRef[1].contains("NM=")); } @Test(expected = GorResourceException.class) diff --git a/model/src/main/java/org/gorpipe/gor/driver/providers/stream/datatypes/cram/CramIterator.java b/model/src/main/java/org/gorpipe/gor/driver/providers/stream/datatypes/cram/CramIterator.java index b19c8bdcb..fe31c7a7f 100644 --- a/model/src/main/java/org/gorpipe/gor/driver/providers/stream/datatypes/cram/CramIterator.java +++ b/model/src/main/java/org/gorpipe/gor/driver/providers/stream/datatypes/cram/CramIterator.java @@ -32,17 +32,19 @@ import org.apache.commons.io.FilenameUtils; import org.apache.commons.lang3.StringUtils; import org.gorpipe.exceptions.GorResourceException; -import org.gorpipe.gor.driver.meta.DataType; import org.gorpipe.gor.driver.providers.stream.datatypes.bam.BamIterator; +import org.gorpipe.gor.driver.providers.stream.datatypes.cram.reference.CompositeReferenceSource; +import org.gorpipe.gor.driver.providers.stream.datatypes.cram.reference.EBIReferenceSource; +import org.gorpipe.gor.driver.providers.stream.datatypes.cram.reference.FolderReferenceSource; +import org.gorpipe.gor.driver.providers.stream.datatypes.cram.reference.SharedFastaReferenceSource; import org.gorpipe.gor.model.ChromoLookup; import org.gorpipe.gor.session.GorSession; import org.gorpipe.gor.driver.adapters.StreamSourceSeekableStream; import org.gorpipe.gor.driver.providers.stream.sources.StreamSource; import org.gorpipe.gor.table.util.PathUtils; -import org.gorpipe.gor.util.DataUtil; -import org.gorpipe.gor.util.StringUtil; import org.gorpipe.gor.model.Row; -import org.gorpipe.gor.model.SharedFastaReferenceSource; +import org.gorpipe.model.gor.iterators.RefSeq; +import org.gorpipe.util.Strings; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -66,8 +68,9 @@ */ public class CramIterator extends BamIterator { - private final static String KEY_GENERATEMISSINGATTRIBUTES = "gor.driver.cram.generatemissingattributes"; - private final static String KEY_FASTAREFERENCESOURCE = "gor.driver.cram.fastareferencesource"; + public final static String KEY_GENERATEMISSINGATTRIBUTES = "gor.driver.cram.generatemissingattributes"; + public final static String KEY_FASTAREFERENCESOURCE = "gor.driver.cram.fastareferencesource"; + public final static String KEY_REFERENCE_FORCE_FOLDER = "gor.driver.cram.reference.force.folder."; private static final Logger log = LoggerFactory.getLogger(CramIterator.class); @@ -75,10 +78,11 @@ public class CramIterator extends BamIterator { private int[] columns; ChromoLookup lookup; private String fileName; - private String cramReferencePath = ""; - private CRAMFileReader cramFileReader; - private ReferenceSequenceFile referenceSequenceFile; + private String projectCramReferencePath; // Cram reference path from project context. + private RefSeq projectRefSeq; // RefSeq from project context, used for MD tag calculation. + private ReferenceSequenceFile referenceSequenceFile; // Handle to cram reference file, fallback for MD tag calculation. private CRAMReferenceSource referenceSource; + private CRAMFileReader cramFileReader; private boolean generateMissingCramAttributes; /** @@ -94,35 +98,6 @@ public CramIterator(ChromoLookup lookup, CramFile cramFile, int[] columns) { this.lookup = lookup; } - - /** - * Construct a CramIterator - * - * @param lookup The lookup service for chromosome name to ids - * @param file The CRAM File to iterate through - */ - public CramIterator(ChromoLookup lookup, String file, String index, String reference, boolean generateMissingAttributes) { - - fileName = file; - generateMissingCramAttributes = generateMissingAttributes; - File cramfile = new File(file); - File cramindex = new File(index); - if (!cramindex.exists()) { - cramindex = new File(DataUtil.toFile(file, DataType.CRAI)); - } - - referenceSequenceFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(new File(reference)); - referenceSource = createReferenceSource(fileName, ""); - - try { - cramFileReader = new CRAMFileReader(cramfile, new FileInputStream(cramindex), referenceSource); - } catch (FileNotFoundException e) { - throw new GorResourceException("Cram file not found.", file, e); - } - SamReader samreader = new SamReader.PrimitiveSamReaderToSamReaderAdapter(cramFileReader, null); - init(lookup, samreader, true); - } - @Override public Row next() { Row row = super.next(); @@ -135,8 +110,16 @@ public Row next() { boolean calculateNM = record.getIntegerAttribute(SAMTag.NM.name()) == null; if (calculateMD) { - byte[] referenceBytes = referenceSequenceFile.getSubsequenceAt(record.getContig(), record.getAlignmentStart(), record.getAlignmentEnd()).getBases(); - CramUtils.calculateMdAndNmTags(record, referenceBytes, calculateMD, calculateNM); + byte[] referenceBytes = null; + if (projectRefSeq != null) { + referenceBytes = projectRefSeq.getBases(record.getContig(), record.getAlignmentStart(), record.getAlignmentEnd()).getBytes(); + } else if (referenceSequenceFile != null) { + // Fallback to the reference file used by the CRAM reader + referenceBytes = referenceSequenceFile.getSubsequenceAt(record.getContig(), record.getAlignmentStart(), record.getAlignmentEnd()).getBases(); + } + if (referenceBytes != null) { + CramUtils.calculateMdAndNmTags(record, referenceBytes, calculateMD, calculateNM); + } } else if (calculateNM) { SequenceUtil.calculateSamNmTagFromCigar(record); } @@ -170,7 +153,10 @@ public void init(GorSession session) { return; } - cramReferencePath = session.getProjectContext().getReferenceBuild().getCramReferencePath(); + projectCramReferencePath = session.getProjectContext().getReferenceBuild().getCramReferencePath(); + if (!Strings.isNullOrEmpty(session.getProjectContext().getGorConfigFile())) { + projectRefSeq = session.getProjectContext().createRefSeq(); + } if (cramFile != null) { // I read this property here through System.getProperty as there is no other way to pass properties to the driver @@ -237,7 +223,7 @@ private CRAMReferenceSource createReferenceSource(String ref, String root) { } // This reference should be fasta but we let the htsjdk library decide - return createFileReference(file.toString()); + return createFileReference(file); } private File getReferenceFromGorOptions(File file) { @@ -252,8 +238,8 @@ private File getReferenceFromGorOptions(File file) { } private File getReferenceFromGorConfig(File file, String root) { - if (!file.exists() && !StringUtil.isEmpty(cramReferencePath)) { - return PathUtils.resolve(Paths.get(root), Paths.get(cramReferencePath)).toFile(); + if (!file.exists() && !Strings.isNullOrEmpty(projectCramReferencePath)) { + return PathUtils.resolve(Paths.get(root), Paths.get(projectCramReferencePath)).toFile(); } return file; } @@ -277,10 +263,22 @@ private File getReferenceFromReferenceLinkFile(File file) { return file; } - private CRAMReferenceSource createFileReference(String ref) { - String referenceKey = FilenameUtils.removeExtension(FilenameUtils.getBaseName(ref)); - referenceSequenceFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(new File(ref)); - return new SharedFastaReferenceSource(referenceSequenceFile, referenceKey); + private CRAMReferenceSource createFileReference(File refFile) { + if (refFile.isDirectory()) { + return new CompositeReferenceSource(List.of( + new FolderReferenceSource(refFile.getPath()), + new EBIReferenceSource(refFile.getPath()))); + } else if (Boolean.getBoolean(System.getProperty(KEY_REFERENCE_FORCE_FOLDER, "true"))) { + return new CompositeReferenceSource(List.of( + new FolderReferenceSource(refFile.getParent()), + new EBIReferenceSource(refFile.getParent()))); + } else { + referenceSequenceFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(refFile); + + String referenceKey = FilenameUtils.removeExtension(refFile.getName()); + var referenceFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(refFile); + return new SharedFastaReferenceSource(referenceFile, referenceKey); + } } } diff --git a/model/src/main/java/org/gorpipe/gor/driver/providers/stream/datatypes/cram/reference/CompositeReferenceSource.java b/model/src/main/java/org/gorpipe/gor/driver/providers/stream/datatypes/cram/reference/CompositeReferenceSource.java new file mode 100644 index 000000000..963656f78 --- /dev/null +++ b/model/src/main/java/org/gorpipe/gor/driver/providers/stream/datatypes/cram/reference/CompositeReferenceSource.java @@ -0,0 +1,55 @@ +package org.gorpipe.gor.driver.providers.stream.datatypes.cram.reference; + +import htsjdk.samtools.SAMSequenceRecord; +import htsjdk.samtools.cram.ref.CRAMReferenceSource; + +import java.io.Closeable; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +/** + * Composite reference source, tries out different reference source in order. + */ +public class CompositeReferenceSource implements CRAMReferenceSource, Closeable { + + List sources; + + public CompositeReferenceSource(List sources) { + this.sources = sources != null ? new ArrayList<>(sources) : new ArrayList<>(); + } + + @Override + public byte[] getReferenceBases(SAMSequenceRecord sequenceRecord, boolean tryNameVariants) { + byte[] bytes = null; + for (var source : sources) { + bytes = source.getReferenceBases(sequenceRecord, tryNameVariants); + if (bytes != null) { + return bytes; + } + } + return bytes; + } + + @Override + public byte[] getReferenceBasesByRegion(SAMSequenceRecord sequenceRecord, int zeroBasedStart, int requestedRegionLength) { + byte[] bytes = null; + for (var source : sources) { + bytes = source.getReferenceBasesByRegion(sequenceRecord, zeroBasedStart, requestedRegionLength); + if (bytes != null) { + return bytes; + } + } + return bytes; + } + + @Override + public void close() throws IOException { + for (var source : sources) { + if (source instanceof Closeable) { + ((Closeable) source).close(); + } + } + sources.clear(); + } +} diff --git a/model/src/main/java/org/gorpipe/gor/driver/providers/stream/datatypes/cram/reference/EBIReferenceSource.java b/model/src/main/java/org/gorpipe/gor/driver/providers/stream/datatypes/cram/reference/EBIReferenceSource.java new file mode 100644 index 000000000..d12dbfac7 --- /dev/null +++ b/model/src/main/java/org/gorpipe/gor/driver/providers/stream/datatypes/cram/reference/EBIReferenceSource.java @@ -0,0 +1,155 @@ +package org.gorpipe.gor.driver.providers.stream.datatypes.cram.reference; + +import htsjdk.samtools.Defaults; +import htsjdk.samtools.SAMSequenceRecord; +import org.gorpipe.exceptions.GorDataException; +import org.gorpipe.exceptions.GorResourceException; +import org.gorpipe.gor.table.util.PathUtils; +import org.gorpipe.util.Strings; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.BufferedInputStream; +import java.io.IOException; +import java.net.HttpURLConnection; +import java.net.URL; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.HashSet; +import java.util.Map; +import java.util.Set; +import java.util.concurrent.ConcurrentHashMap; + +/** + * Get reference sources from the EBI service. Optionally caching them locally for future reference. + * The downloaded references sequences are stored in the provided folder as md5_.txt files. + */ +public class EBIReferenceSource extends MD5CachedReferenceSource { + private static final Logger log = LoggerFactory.getLogger(EBIReferenceSource.class); + + public static final String KEY_USE_CRAM_REF_DOWNLOAD = "gor.driver.cram.ref.download"; + + private static final String REFBASES_PREFIX = "md5_"; + private static final String REFBASES_EXT = ".txt"; + + protected static Map md5ToRefbases = new ConcurrentHashMap<>(); + + private Path referenceFolder; // If null we do not download. + + public EBIReferenceSource() { + } + + public EBIReferenceSource(String referenceFolder) { + if (!Strings.isNullOrEmpty(referenceFolder)) { + this.referenceFolder = Path.of(referenceFolder); + + if (!Files.isDirectory(this.referenceFolder)) { + throw new GorResourceException("Can not create FolderReferenceSource %s as the target is not a folder or does not exists".formatted(referenceFolder), referenceFolder); + } + scanReferenceFolder(); + } + } + + Set getRefbasesFiles() { + return new HashSet<>(md5ToRefbases.values()); + } + + private void scanReferenceFolder() { + md5ToRefbases.clear(); + + if (referenceFolder == null) return; + + try { + for (var p : Files.list(referenceFolder).filter(Files::isRegularFile).toList()) { + var f = p.getFileName().toString().toLowerCase(); + if (f.startsWith(REFBASES_PREFIX) && f.endsWith(REFBASES_EXT)) { + processRefbasesFile(referenceFolder.resolve(f)); + } + } + } catch (IOException e) { + log.warn("Failed scanning reference folder {}", referenceFolder, e); + } + } + + private void processRefbasesFile(Path refbases) { + String fileName = refbases.getFileName().toString(); + if (!fileName.startsWith(REFBASES_PREFIX) || !fileName.endsWith(REFBASES_EXT)) return; + + String md5 = fileName.substring(REFBASES_PREFIX.length(), fileName.length() - REFBASES_EXT.length()); + md5ToRefbases.put(md5, refbases); + } + + @Override + protected byte[] loadReference(final SAMSequenceRecord record) { + // Load from refbases file. + Path refbasesPath = md5ToRefbases.get(record.getMd5()); + if (refbasesPath != null) { + try { + byte[] bases = Files.readAllBytes(refbasesPath); + if (bases == null || bases.length == 0) { + throw new GorDataException("Reference sequence in " + refbasesPath + " is empty"); + } + return bases; + } catch (IOException e) { + throw new GorDataException("Could not read refbases file %s".formatted(refbasesPath), e); + } + } + + // Load from EBI service. + if (Boolean.parseBoolean(System.getProperty(KEY_USE_CRAM_REF_DOWNLOAD, + Boolean.toString(Defaults.USE_CRAM_REF_DOWNLOAD)))) { + try { + + byte[] bases = downloadFromEBI(record.getMd5()); + if (bases != EMPTY_BASES) { + saveRefbasesToDisk(record.getMd5(), bases); + } + return bases; + } catch (IOException e) { + log.warn("Could not download/save reference sequence for md5 " + record.getMd5(), e); + } + } + + return EMPTY_BASES; + } + + /** + * Download reference sequence from EBI by MD5 and store it in the reference folder. + * @param md5 + * @return bytes of the reference sequence, null if not found. + * @throws IOException + */ + private byte[] downloadFromEBI(String md5) throws IOException { + log.info("Downloading reference {} from ENA", md5); + URL url = new URL(String.format(Defaults.EBI_REFERENCE_SERVICE_URL_MASK, md5)); + HttpURLConnection conn = (HttpURLConnection) url.openConnection(); + conn.setConnectTimeout(15000); + conn.setReadTimeout(30000); + conn.setRequestMethod("GET"); + + if (conn.getResponseCode() != 200) { + log.warn("ENA returned {} for {}", conn.getResponseCode(), md5); + return EMPTY_BASES; + } + + byte[] bases; + try (BufferedInputStream in = new BufferedInputStream(conn.getInputStream())) { + bases = in.readAllBytes(); + } + if (bases.length == 0) return EMPTY_BASES; + + return bases; + } + + private void saveRefbasesToDisk(String md5, byte[] bases) throws IOException { + if (referenceFolder == null) return; + + Path basesPath = referenceFolder.resolve(REFBASES_PREFIX + md5 + REFBASES_EXT); + Path tempBasesPath = PathUtils.getTempFilePath(basesPath); + + Files.write(tempBasesPath, bases); + Files.move(tempBasesPath, basesPath); + + md5ToRefbases.put(md5, basesPath); + } +} diff --git a/model/src/main/java/org/gorpipe/gor/driver/providers/stream/datatypes/cram/reference/FolderReferenceSource.java b/model/src/main/java/org/gorpipe/gor/driver/providers/stream/datatypes/cram/reference/FolderReferenceSource.java new file mode 100644 index 000000000..127b3ebd3 --- /dev/null +++ b/model/src/main/java/org/gorpipe/gor/driver/providers/stream/datatypes/cram/reference/FolderReferenceSource.java @@ -0,0 +1,125 @@ +/* + * BEGIN_COPYRIGHT + * + * Copyright (C) 2011-2013 deCODE genetics Inc. + * Copyright (C) 2013-2019 WuXi NextCode Inc. + * All Rights Reserved. + * + * GORpipe is free software: you can redistribute it and/or modify + * it under the terms of the AFFERO GNU General Public License as published by + * the Free Software Foundation. + * + * GORpipe is distributed "AS-IS" AND WITHOUT ANY WARRANTY OF ANY KIND, + * INCLUDING ANY IMPLIED WARRANTY OF MERCHANTABILITY, + * NON-INFRINGEMENT, OR FITNESS FOR A PARTICULAR PURPOSE. See + * the AFFERO GNU General Public License for the complete license terms. + * + * You should have received a copy of the AFFERO GNU General Public License + * along with GORpipe. If not, see + * + * END_COPYRIGHT + */ + +package org.gorpipe.gor.driver.providers.stream.datatypes.cram.reference; + +import htsjdk.samtools.SAMSequenceDictionary; +import htsjdk.samtools.SAMSequenceRecord; +import htsjdk.samtools.reference.ReferenceSequenceFile; +import htsjdk.samtools.reference.ReferenceSequenceFileFactory; +import org.gorpipe.exceptions.GorDataException; +import org.gorpipe.exceptions.GorResourceException; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.*; +import java.util.concurrent.ConcurrentHashMap; + +/** + * Reference source that resolves references by MD5 using a folder of FASTA files (with .dict and .fai) + * + */ + +public class FolderReferenceSource extends MD5CachedReferenceSource { + + private static final Logger log = LoggerFactory.getLogger(FolderReferenceSource.class); + + private static final Set FASTA_EXT = Set.of("fa", "fasta"); + + private record Md5ReferencePath(String md5, Path path, String contig) {} + protected static Map md5ToReferencePath = new ConcurrentHashMap<>(); + + private Path referenceFolder; + + private final Map refFileByPath = new ConcurrentHashMap<>(); + + public FolderReferenceSource(String referenceFolder) { + this.referenceFolder = Path.of(referenceFolder); + if (!Files.isDirectory(this.referenceFolder)) { + throw new GorResourceException("Can not create FolderReferenceSource %s as the target is not a folder or does not exists".formatted(referenceFolder), referenceFolder); + } + scanReferenceFolder(); + } + + @Override + protected byte[] loadReference(final SAMSequenceRecord record) { + // Load form fasta file. + Md5ReferencePath referencePath = md5ToReferencePath.get(record.getMd5()); + if (referencePath != null) { + ReferenceSequenceFile rsFile = refFileByPath.computeIfAbsent(referencePath.path(), ReferenceSequenceFileFactory::getReferenceSequenceFile); + if (rsFile == null) { + throw new GorDataException(String.format("Reference file %s for md5 %s not found in reference folder %s", + referencePath, referencePath.md5(), referenceFolder)); + } + + return rsFile.getSequence(referencePath.contig()).getBases(); + } + + return EMPTY_BASES; + } + + private void scanReferenceFolder() { + md5ToReferencePath.clear(); + + try { + for (var p : Files.list(referenceFolder).filter(Files::isRegularFile).toList()) { + var f = p.getFileName().toString().toLowerCase(); + if (FASTA_EXT.stream().anyMatch(ext -> f.endsWith("." + ext))) { + processFasta(referenceFolder.resolve(f)); + } + } + } catch (IOException e) { + log.warn("Failed scanning reference folder {}", referenceFolder, e); + } + } + + private void processFasta(Path fastaFile) { + ReferenceSequenceFile refFile = refFileByPath.computeIfAbsent(fastaFile, ReferenceSequenceFileFactory::getReferenceSequenceFile); + SAMSequenceDictionary dictionary = refFile.getSequenceDictionary(); + if (dictionary == null) { + throw new GorResourceException("Fasta file %s is invalid cram reference as it is missing dict file".formatted(fastaFile), fastaFile.toString()); + } + for (SAMSequenceRecord rec : dictionary.getSequences()) { + String md5 = rec.getMd5(); + if (md5 == null || md5.isEmpty()) continue; + md5ToReferencePath.put(md5, new FolderReferenceSource.Md5ReferencePath(md5, fastaFile, rec.getContig())); + } + } + + @Override + public void close() { + refFileByPath.values().forEach(r -> { + try { + r.close(); + } catch (IOException ignored) { + } + }); + refFileByPath.clear(); + } + + Set getReferenceFiles() { + return new HashSet<>(refFileByPath.keySet()); + } +} diff --git a/model/src/main/java/org/gorpipe/gor/driver/providers/stream/datatypes/cram/reference/MD5CachedReferenceSource.java b/model/src/main/java/org/gorpipe/gor/driver/providers/stream/datatypes/cram/reference/MD5CachedReferenceSource.java new file mode 100644 index 000000000..ccd6c6180 --- /dev/null +++ b/model/src/main/java/org/gorpipe/gor/driver/providers/stream/datatypes/cram/reference/MD5CachedReferenceSource.java @@ -0,0 +1,88 @@ +package org.gorpipe.gor.driver.providers.stream.datatypes.cram.reference; + +import com.google.common.cache.Cache; +import com.google.common.cache.CacheBuilder; +import com.google.common.cache.RemovalListener; +import htsjdk.samtools.SAMSequenceRecord; +import htsjdk.samtools.cram.ref.CRAMReferenceSource; +import htsjdk.samtools.util.StringUtil; +import htsjdk.utils.ValidationUtils; +import org.gorpipe.exceptions.GorDataException; +import org.gorpipe.util.Strings; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.Closeable; +import java.io.IOException; +import java.util.Arrays; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.TimeUnit; + +public abstract class MD5CachedReferenceSource implements CRAMReferenceSource, Closeable { + private static final Logger log = LoggerFactory.getLogger(MD5CachedReferenceSource.class); + + public static final byte[] EMPTY_BASES = new byte[0]; + + private static final Cache md5BasesCache = createMd5BasesCache(); + + private static Cache createMd5BasesCache () { + RemovalListener removalNotifier; + removalNotifier = notification -> log.debug("Removing from cache, key: {}, cause: {}", notification.getKey(), notification.getCause()); + + // TODO: Move this to owner config or context object when the dirver model supports that + Integer cacheTimout = Integer.parseInt(System.getProperty("gor.driver.cram.referencetimeout", "60")); // Seconds + + return CacheBuilder.newBuilder().removalListener(removalNotifier) + .expireAfterAccess(cacheTimout, TimeUnit.SECONDS).build(); + } + + @Override + public synchronized byte[] getReferenceBases(final SAMSequenceRecord record, + final boolean tryNameVariants) { + // check cache by sequence name: + final String md5 = record.getMd5(); + + if (Strings.isNullOrEmpty(md5)) { + throw new GorDataException("Can not load reference bases as SAMSequenceRecord does not contain MD5"); + } + + try { + var bases = md5BasesCache.get(md5, () -> loadReference(record)); + StringUtil.toUpperCase(bases); + return bases != EMPTY_BASES ? bases : null; + }catch (ExecutionException e) { + throw new GorDataException("Failed to load CRAM reference: " + md5, e); + } + } + + @Override + public byte[] getReferenceBasesByRegion( + final SAMSequenceRecord sequenceRecord, + final int zeroBasedStart, + final int requestedRegionLength) { + + ValidationUtils.validateArg(zeroBasedStart >= 0, "start must be >= 0"); + byte[] bases = getReferenceBases(sequenceRecord, false); + if (bases != null) { + if (zeroBasedStart >= bases.length) { + throw new IllegalArgumentException(String.format("Requested start %d is beyond the sequence length %s", + zeroBasedStart, + sequenceRecord.getSequenceName())); + } + return Arrays.copyOfRange(bases, zeroBasedStart, Math.min(bases.length, zeroBasedStart + requestedRegionLength)); + } + return bases; + } + + /** + * Load reference by MD5 from disk, downloading from EBI ENA if needed. + * @param record SAM record with the sequence detail (must include MD5). + * @return the bases, or EMPTY_BASES if no bases where found on disk and on EBI. + */ + protected abstract byte[] loadReference(SAMSequenceRecord record); + + @Override + public void close() throws IOException { + + } +} diff --git a/model/src/main/java/org/gorpipe/gor/model/SharedCachedReferenceSource.java b/model/src/main/java/org/gorpipe/gor/driver/providers/stream/datatypes/cram/reference/SharedCachedReferenceSource.java similarity index 98% rename from model/src/main/java/org/gorpipe/gor/model/SharedCachedReferenceSource.java rename to model/src/main/java/org/gorpipe/gor/driver/providers/stream/datatypes/cram/reference/SharedCachedReferenceSource.java index c969f5146..4253dd374 100644 --- a/model/src/main/java/org/gorpipe/gor/model/SharedCachedReferenceSource.java +++ b/model/src/main/java/org/gorpipe/gor/driver/providers/stream/datatypes/cram/reference/SharedCachedReferenceSource.java @@ -20,7 +20,7 @@ * END_COPYRIGHT */ -package org.gorpipe.gor.model; +package org.gorpipe.gor.driver.providers.stream.datatypes.cram.reference; import com.google.common.cache.Cache; import com.google.common.cache.CacheBuilder; diff --git a/model/src/main/java/org/gorpipe/gor/model/SharedChromSeqReferenceSource.java b/model/src/main/java/org/gorpipe/gor/driver/providers/stream/datatypes/cram/reference/SharedChromSeqReferenceSource.java similarity index 97% rename from model/src/main/java/org/gorpipe/gor/model/SharedChromSeqReferenceSource.java rename to model/src/main/java/org/gorpipe/gor/driver/providers/stream/datatypes/cram/reference/SharedChromSeqReferenceSource.java index e8c768224..eec784619 100644 --- a/model/src/main/java/org/gorpipe/gor/model/SharedChromSeqReferenceSource.java +++ b/model/src/main/java/org/gorpipe/gor/driver/providers/stream/datatypes/cram/reference/SharedChromSeqReferenceSource.java @@ -20,7 +20,7 @@ * END_COPYRIGHT */ -package org.gorpipe.gor.model; +package org.gorpipe.gor.driver.providers.stream.datatypes.cram.reference; import htsjdk.samtools.SAMSequenceRecord; import org.gorpipe.gor.driver.meta.DataType; diff --git a/model/src/main/java/org/gorpipe/gor/model/SharedFastaReferenceSource.java b/model/src/main/java/org/gorpipe/gor/driver/providers/stream/datatypes/cram/reference/SharedFastaReferenceSource.java similarity index 98% rename from model/src/main/java/org/gorpipe/gor/model/SharedFastaReferenceSource.java rename to model/src/main/java/org/gorpipe/gor/driver/providers/stream/datatypes/cram/reference/SharedFastaReferenceSource.java index 7c98a7103..11011ab1d 100644 --- a/model/src/main/java/org/gorpipe/gor/model/SharedFastaReferenceSource.java +++ b/model/src/main/java/org/gorpipe/gor/driver/providers/stream/datatypes/cram/reference/SharedFastaReferenceSource.java @@ -20,7 +20,7 @@ * END_COPYRIGHT */ -package org.gorpipe.gor.model; +package org.gorpipe.gor.driver.providers.stream.datatypes.cram.reference; import htsjdk.samtools.SAMException; import htsjdk.samtools.SAMSequenceRecord; diff --git a/model/src/main/java/org/gorpipe/gor/table/util/PathUtils.java b/model/src/main/java/org/gorpipe/gor/table/util/PathUtils.java index fc625756b..d006a0d97 100644 --- a/model/src/main/java/org/gorpipe/gor/table/util/PathUtils.java +++ b/model/src/main/java/org/gorpipe/gor/table/util/PathUtils.java @@ -232,6 +232,10 @@ public static String removeExtensions(String fileName) { return fileName.contains(".") ? fileName.substring(0, fileName.indexOf('.')) : fileName; } + public static String removeLastExtension(String fileName) { + return fileName.contains(".") ? fileName.substring(0, fileName.lastIndexOf('.')) : fileName; + } + public static String markAsFolder(String path) { if (!path.endsWith("/")) { return path + "/"; @@ -335,6 +339,11 @@ public static String getCurrentAbsolutePath() { return Path.of("").toAbsolutePath().toString(); } + /** + * Get a temporary file path in the same folder and with similar name as the given file path. + * @param filePath the file path to base the temp file path on. + * @return the temporary file path. + */ public static Path getTempFilePath(Path filePath) { var baseName = filePath.getFileName().toString(); var dotIndex = baseName.indexOf('.'); diff --git a/model/src/test/java/org/gorpipe/gor/driver/providers/stream/datatypes/cram/reference/UTestCompositeReferenceSource.java b/model/src/test/java/org/gorpipe/gor/driver/providers/stream/datatypes/cram/reference/UTestCompositeReferenceSource.java new file mode 100644 index 000000000..eb7185a8d --- /dev/null +++ b/model/src/test/java/org/gorpipe/gor/driver/providers/stream/datatypes/cram/reference/UTestCompositeReferenceSource.java @@ -0,0 +1,181 @@ +/* + * BEGIN_COPYRIGHT + * + * Copyright (C) 2011-2013 deCODE genetics Inc. + * Copyright (C) 2013-2019 WuXi NextCode Inc. + * All Rights Reserved. + * + * GORpipe is free software: you can redistribute it and/or modify + * it under the terms of the AFFERO GNU General Public License as published by + * the Free Software Foundation. + * + * GORpipe is distributed "AS-IS" AND WITHOUT ANY WARRANTY OF ANY KIND, + * INCLUDING ANY IMPLIED WARRANTY OF MERCHANTABILITY, + * NON-INFRINGEMENT, OR FITNESS FOR A PARTICULAR PURPOSE. See + * the AFFERO GNU General Public License for the complete license terms. + * + * You should have received a copy of the AFFERO GNU General Public License + * along with GORpipe. If not, see + * + * END_COPYRIGHT + */ + +package org.gorpipe.gor.driver.providers.stream.datatypes.cram.reference; + +import htsjdk.samtools.SAMSequenceRecord; +import htsjdk.samtools.cram.ref.CRAMReferenceSource; +import org.junit.Before; +import org.junit.Test; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; + +import static org.junit.Assert.*; +import static org.mockito.Mockito.*; + +public class UTestCompositeReferenceSource { + + private CRAMReferenceSource mockSource1; + private CRAMReferenceSource mockSource2; + private CRAMReferenceSource mockSource3; + private SAMSequenceRecord testRecord; + private byte[] testBases; + + @Before + public void setUp() { + mockSource1 = mock(CRAMReferenceSource.class); + mockSource2 = mock(CRAMReferenceSource.class); + mockSource3 = mock(CRAMReferenceSource.class); + testRecord = new SAMSequenceRecord("chr1", 1000); + testBases = "ACGTACGT".getBytes(); + } + + @Test + public void getReferenceBasesReturnsResultFromSingleSource() { + when(mockSource1.getReferenceBases(testRecord, false)).thenReturn(testBases); + + CompositeReferenceSource composite = new CompositeReferenceSource(Collections.singletonList(mockSource1)); + + byte[] result = composite.getReferenceBases(testRecord, false); + + assertArrayEquals(testBases, result); + verify(mockSource1, times(1)).getReferenceBases(testRecord, false); + } + + @Test + public void getReferenceBasesReturnsResultFromFirstSourceWhenMultipleSources() { + when(mockSource1.getReferenceBases(testRecord, false)).thenReturn(testBases); + + CompositeReferenceSource composite = new CompositeReferenceSource(Arrays.asList(mockSource1, mockSource2)); + + byte[] result = composite.getReferenceBases(testRecord, false); + + assertArrayEquals(testBases, result); + verify(mockSource1, times(1)).getReferenceBases(testRecord, false); + verify(mockSource2, never()).getReferenceBases(testRecord, false); + } + + @Test + public void getReferenceBasesFallsBackToSecondSourceWhenFirstFails() { + when(mockSource1.getReferenceBases(testRecord, false)).thenReturn(null); + when(mockSource2.getReferenceBases(testRecord, false)).thenReturn(testBases); + + CompositeReferenceSource composite = new CompositeReferenceSource(Arrays.asList(mockSource1, mockSource2)); + + byte[] result = composite.getReferenceBases(testRecord, false); + + assertArrayEquals(testBases, result); + verify(mockSource1, times(1)).getReferenceBases(testRecord, false); + verify(mockSource2, times(1)).getReferenceBases(testRecord, false); + } + + @Test + public void getReferenceBasesThrowsExceptionWhenAllSourcesFail() { + when(mockSource1.getReferenceBases(testRecord, false)).thenReturn(null); + when(mockSource2.getReferenceBases(testRecord, false)).thenReturn(null); + when(mockSource3.getReferenceBases(testRecord, false)).thenReturn(null); + + CompositeReferenceSource composite = new CompositeReferenceSource(Arrays.asList(mockSource1, mockSource2, mockSource3)); + + assertEquals(null, composite.getReferenceBases(testRecord, false)); + } + + @Test + public void getReferenceBasesPassesTryNameVariantsParameter() { + when(mockSource1.getReferenceBases(testRecord, true)).thenReturn(testBases); + + CompositeReferenceSource composite = new CompositeReferenceSource(Collections.singletonList(mockSource1)); + + byte[] result = composite.getReferenceBases(testRecord, true); + + assertArrayEquals(testBases, result); + verify(mockSource1, times(1)).getReferenceBases(testRecord, true); + } + + + @Test + public void getReferenceBasesThrowsExceptionWhenSourcesListIsEmpty() { + CompositeReferenceSource composite = new CompositeReferenceSource(new ArrayList<>()); + assertEquals(null, composite.getReferenceBases(testRecord, false)); + } + + @Test + public void getReferenceBasesHandlesDifferentSequences() { + SAMSequenceRecord record1 = new SAMSequenceRecord("chr1", 1000); + SAMSequenceRecord record2 = new SAMSequenceRecord("chr2", 2000); + byte[] bases1 = "ACGT".getBytes(); + byte[] bases2 = "TGCA".getBytes(); + + when(mockSource1.getReferenceBases(record1, false)).thenReturn(bases1); + when(mockSource1.getReferenceBases(record2, false)).thenReturn(bases2); + + CompositeReferenceSource composite = new CompositeReferenceSource(Collections.singletonList(mockSource1)); + + assertArrayEquals(bases1, composite.getReferenceBases(record1, false)); + assertArrayEquals(bases2, composite.getReferenceBases(record2, false)); + } + + @Test + public void getReferenceBasesReturnsFirstSuccessfulResult() { + byte[] bases1 = "FIRST".getBytes(); + byte[] bases2 = "SECOND".getBytes(); + + when(mockSource1.getReferenceBases(testRecord, false)).thenReturn(bases1); + when(mockSource2.getReferenceBases(testRecord, false)).thenReturn(bases2); + + CompositeReferenceSource composite = new CompositeReferenceSource(Arrays.asList(mockSource1, mockSource2)); + + byte[] result = composite.getReferenceBases(testRecord, false); + + assertArrayEquals(bases1, result); + verify(mockSource1, times(1)).getReferenceBases(testRecord, false); + verify(mockSource2, never()).getReferenceBases(testRecord, false); + } + + @Test + public void getReferenceBasesCanBeCalledMultipleTimes() { + when(mockSource1.getReferenceBases(testRecord, false)).thenReturn(testBases); + + CompositeReferenceSource composite = new CompositeReferenceSource(Collections.singletonList(mockSource1)); + + assertArrayEquals(testBases, composite.getReferenceBases(testRecord, false)); + assertArrayEquals(testBases, composite.getReferenceBases(testRecord, false)); + verify(mockSource1, times(2)).getReferenceBases(testRecord, false); + } + + @Test + public void getReferenceBasesHandlesLargeSequences() { + byte[] largeBases = new byte[10000]; + Arrays.fill(largeBases, (byte) 'A'); + + when(mockSource1.getReferenceBases(testRecord, false)).thenReturn(largeBases); + + CompositeReferenceSource composite = new CompositeReferenceSource(Collections.singletonList(mockSource1)); + + byte[] result = composite.getReferenceBases(testRecord, false); + + assertEquals(largeBases.length, result.length); + assertArrayEquals(largeBases, result); + } +} diff --git a/model/src/test/java/org/gorpipe/gor/driver/providers/stream/datatypes/cram/reference/UTestEBIReferenceSource.java b/model/src/test/java/org/gorpipe/gor/driver/providers/stream/datatypes/cram/reference/UTestEBIReferenceSource.java new file mode 100644 index 000000000..f6e1ccc43 --- /dev/null +++ b/model/src/test/java/org/gorpipe/gor/driver/providers/stream/datatypes/cram/reference/UTestEBIReferenceSource.java @@ -0,0 +1,245 @@ +package org.gorpipe.gor.driver.providers.stream.datatypes.cram.reference; + +import htsjdk.samtools.SAMSequenceRecord; +import org.gorpipe.exceptions.GorDataException; +import org.gorpipe.exceptions.GorResourceException; +import org.junit.*; +import org.junit.rules.TemporaryFolder; + +import java.io.File; +import java.io.FileWriter; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; + +/** + * Tests for FolderReferenceSource class. + * Tests folder scanning, MD5 mapping, and reference base retrieval. + */ +public class UTestEBIReferenceSource { + + @Rule + public TemporaryFolder workDir = new TemporaryFolder(); + + private File testFolder; + private EBIReferenceSource referenceSource; + + @Before + public void setUp() throws IOException { + testFolder = workDir.newFolder("ref_folder"); + System.setProperty(EBIReferenceSource.KEY_USE_CRAM_REF_DOWNLOAD, "true"); + } + + @After + public void tearDown() throws IOException { + if (referenceSource != null) { + referenceSource.close(); + } + } + + @Test + public void testConstructorWithEmptyFolder() throws IOException { + // Test with empty folder + referenceSource = new EBIReferenceSource(testFolder.getAbsolutePath()); + + // Should not throw exception, but map should be empty + Assert.assertNotNull(referenceSource); + Assert.assertEquals(0, referenceSource.getRefbasesFiles().size()); + } + + @Test(expected = GorResourceException.class) + public void testConstructorWithNonExistentFolder() { + // Test with non-existent folder + referenceSource = new EBIReferenceSource("/non/existent/path"); + } + + @Test + public void testConcurrentAccess() throws IOException, InterruptedException { + createFastaFileWithIndexes("test.fasta", "chr1", "ACGT", "md5_hash"); + + referenceSource = new EBIReferenceSource(testFolder.getAbsolutePath()); + + SAMSequenceRecord record = new SAMSequenceRecord("chr1", 4); + record.setAttribute("M5", "md5_hash"); + + // Test concurrent access + Thread[] threads = new Thread[10]; + for (int i = 0; i < threads.length; i++) { + threads[i] = new Thread(() -> { + byte[] bases = referenceSource.getReferenceBases(record, false); + Assert.assertNotNull(bases); + }); + threads[i].start(); + } + + for (Thread thread : threads) { + thread.join(); + } + } + + @Test + public void testLoadFromRefbasesFile() throws Exception { + String md5 = "refbases_md5"; + byte[] seq = "ACGTREF".getBytes(); + this.createRefbasesFile(md5, seq); + + referenceSource = new EBIReferenceSource(testFolder.getAbsolutePath()); + + SAMSequenceRecord record = new SAMSequenceRecord("chr1", seq.length).setMd5(md5); + + byte[] bases = referenceSource.getReferenceBases(record, false); + + Assert.assertNotNull(bases); + Assert.assertArrayEquals(seq, bases); + Assert.assertEquals(1, referenceSource.getRefbasesFiles().size()); + } + + @Test + public void testDownloadFromEBIReal() throws Exception { + String md5 = "d2ed829b8a1628d16cbeee88e88e39eb"; // hg19 chrM + + referenceSource = new EBIReferenceSource(testFolder.getAbsolutePath()); + + SAMSequenceRecord record = new SAMSequenceRecord("chrM").setMd5(md5); + + byte[] bases = referenceSource.getReferenceBases(record, false); + + Assert.assertNotNull(bases); + Assert.assertEquals(16571, bases.length); + + // The download should have persisted a refbases file + Assert.assertEquals(1, referenceSource.getRefbasesFiles().size()); + Path stored = referenceSource.getRefbasesFiles().iterator().next(); + Assert.assertEquals("md5_" + md5 + ".txt", stored.getFileName().toString()); + Assert.assertTrue(Files.exists(stored)); + Assert.assertArrayEquals(bases, Files.readAllBytes(stored)); + } + + /* + @Test + public void testDownloadFromEBIFake() throws Exception { + String md5 = "download_md5"; + // Start simple HTTP server that returns "SEQUENCE_" for path / + com.sun.net.httpserver.HttpServer server = com.sun.net.httpserver.HttpServer.create(new java.net.InetSocketAddress(0), 0); + server.createContext("/", exchange -> { + try { + String path = exchange.getRequestURI().getPath(); + String req = path.startsWith("/") && path.length() > 1 ? path.substring(1) : ""; + byte[] resp = ("SEQUENCE_" + req).getBytes(java.nio.charset.StandardCharsets.US_ASCII); + exchange.sendResponseHeaders(200, resp.length); + try (var os = exchange.getResponseBody()) { + os.write(resp); + } + } finally { + exchange.close(); + } + }); + server.start(); + + // Save and override Defaults for the test + String oldMask = htsjdk.samtools.Defaults.EBI_REFERENCE_SERVICE_URL_MASK; + boolean oldUse = htsjdk.samtools.Defaults.USE_CRAM_REF_DOWNLOAD; + try { + int port = server.getAddress().getPort(); + htsjdk.samtools.Defaults.EBI_REFERENCE_SERVICE_URL_MASK = "http://localhost:" + port + "/%s"; + htsjdk.samtools.Defaults.USE_CRAM_REF_DOWNLOAD = true; + + referenceSource = new FolderReferenceSource(testFolder.getAbsolutePath()); + + SAMSequenceRecord record = new SAMSequenceRecord("chr1", 100).setMd5(md5); + + byte[] bases = referenceSource.getReferenceBases(record, false); + + Assert.assertNotNull(bases); + Assert.assertEquals("SEQUENCE_" + md5, new String(bases, java.nio.charset.StandardCharsets.US_ASCII)); + + // The download should have persisted a refbases file + Assert.assertEquals(1, referenceSource.getRefbasesFiles().size()); + Path stored = referenceSource.getRefbasesFiles().iterator().next(); + Assert.assertTrue(Files.exists(stored)); + Assert.assertArrayEquals(bases, Files.readAllBytes(stored)); + } finally { + // restore defaults and stop server + htsjdk.samtools.Defaults.EBI_REFERENCE_SERVICE_URL_MASK = oldMask; + htsjdk.samtools.Defaults.USE_CRAM_REF_DOWNLOAD = oldUse; + server.stop(0); + } + } + + */ + + /** + * Helper method to create a FASTA file with corresponding .dict and .fai files. + * Note: This is a simplified version. In a real implementation, you would need + * to create proper FASTA index files using samtools or similar tools. + */ + private File createFastaFileWithIndexes(String fileName, String sequenceName, + String sequence, String md5) throws IOException { + // Create FASTA file + File fastaFile = new File(testFolder, fileName); + var header = ""; + try (FileWriter writer = new FileWriter(fastaFile)) { + header = ">" + sequenceName; + if (md5 != null && !md5.isEmpty()) { + header += " MD5:" + md5; + } + header += "\n"; + writer.write(header); + + // Write sequence in 60 char lines (FASTA convention) + int lineLength = 60; + for (int i = 0; i < sequence.length(); i += lineLength) { + int end = Math.min(i + lineLength, sequence.length()); + writer.write(sequence.substring(i, end)); + writer.write("\n"); + } + } + + // Create .dict file (simplified - real dict files have specific format) + String baseName = fileName.substring(0, fileName.lastIndexOf('.')); + File dictFile = new File(testFolder, baseName + ".dict"); + try (FileWriter writer = new FileWriter(dictFile)) { + writer.write("@HD\tVN:1.0\n"); + writer.write("@SQ\tSN:" + sequenceName + "\tLN:" + sequence.length()); + if (md5 != null && !md5.isEmpty()) { + writer.write("\tM5:" + md5); + } + writer.write("\n"); + } + + // Create .fai file (FASTA index - simplified) + File faiFile = new File(testFolder, fileName + ".fai"); + try (FileWriter writer = new FileWriter(faiFile)) { + // Format: sequence_name, length, offset, linebases, linewidth + // This is simplified - real .fai files need proper calculation + writer.write(sequenceName + "\t" + sequence.length() + "\t" + header.length() + "\t60\t61\n"); + } + + return fastaFile; + } + + private File createRefbasesFile(String md5, byte[] bytes) throws IOException { + Path refbases = testFolder.toPath().resolve("md5_" + md5 + ".txt"); + Files.write(refbases, bytes); + return refbases.toFile(); + } + + /** + * Helper method to calculate MD5 hash of a string. + * This is a simplified version - in reality, MD5 should be calculated + * from the actual sequence bytes in the same way htsjdk does it. + */ + private String calculateMD5(String sequence) { + try { + java.security.MessageDigest md = java.security.MessageDigest.getInstance("MD5"); + byte[] digest = md.digest(sequence.getBytes()); + StringBuilder sb = new StringBuilder(); + for (byte b : digest) { + sb.append(String.format("%02x", b)); + } + return sb.toString(); + } catch (Exception e) { + return "test_md5_hash"; + } + } +} diff --git a/model/src/test/java/org/gorpipe/gor/driver/providers/stream/datatypes/cram/reference/UTestFolderReferenceSource.java b/model/src/test/java/org/gorpipe/gor/driver/providers/stream/datatypes/cram/reference/UTestFolderReferenceSource.java new file mode 100644 index 000000000..d48a24675 --- /dev/null +++ b/model/src/test/java/org/gorpipe/gor/driver/providers/stream/datatypes/cram/reference/UTestFolderReferenceSource.java @@ -0,0 +1,351 @@ +package org.gorpipe.gor.driver.providers.stream.datatypes.cram.reference; + +import htsjdk.samtools.SAMSequenceRecord; +import org.gorpipe.exceptions.GorDataException; +import org.gorpipe.exceptions.GorResourceException; +import org.junit.After; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; + +import java.io.File; +import java.io.FileWriter; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; + +/** + * Tests for FolderReferenceSource class. + * Tests folder scanning, MD5 mapping, and reference base retrieval. + */ +public class UTestFolderReferenceSource { + + @Rule + public TemporaryFolder workDir = new TemporaryFolder(); + + private File testFolder; + private FolderReferenceSource referenceSource; + + @Before + public void setUp() throws IOException { + testFolder = workDir.newFolder("ref_folder"); + } + + @After + public void tearDown() throws IOException { + if (referenceSource != null) { + referenceSource.close(); + } + } + + @Test + public void testConstructorWithEmptyFolder() throws IOException { + // Test with empty folder + referenceSource = new FolderReferenceSource(testFolder.getAbsolutePath()); + + // Should not throw exception, but map should be empty + Assert.assertNotNull(referenceSource); + Assert.assertEquals(0, referenceSource.getReferenceFiles().size()); + } + + @Test(expected = GorResourceException.class) + public void testConstructorWithNonExistentFolder() { + // Test with non-existent folder + referenceSource = new FolderReferenceSource("/non/existent/path"); + } + + @Test + public void testScanFolderWithValidFastaFiles() throws IOException { + // Create a valid FASTA file with index files + File fastaFile = createFastaFileWithIndexes("test_ref.fasta", "chr1", "ACGTACGT", "a1b2c3d4e5f6g7h8i9j0k1l2m3n4o5p6"); + + referenceSource = new FolderReferenceSource(testFolder.getAbsolutePath()); + + // Verify that the FASTA file was scanned + Assert.assertNotNull(referenceSource); + Assert.assertEquals(1, referenceSource.getReferenceFiles().size()); + } + + @Test(expected = GorResourceException.class) + public void testScanFolderIgnoresFastaWithoutIndexFiles() throws IOException { + // Create FASTA file without index files + File fastaFile = new File(testFolder, "no_index.fasta"); + try (FileWriter writer = new FileWriter(fastaFile)) { + writer.write(">chr1\nACGTACGT\n"); + } + + referenceSource = new FolderReferenceSource(testFolder.getAbsolutePath()); + } + + @Test + public void testScanFolderWithMultipleFastaFiles() throws IOException { + // Create multiple FASTA files + createFastaFileWithIndexes("ref1.fasta", "chr1", "ACGT", "md5_1"); + createFastaFileWithIndexes("ref2.fasta", "chr2", "TGCA", "md5_2"); + createFastaFileWithIndexes("ref3.fa", "chr3", "GCTA", "md5_3"); + + referenceSource = new FolderReferenceSource(testFolder.getAbsolutePath()); + + Assert.assertNotNull(referenceSource); + Assert.assertEquals(3, referenceSource.getReferenceFiles().size()); + Assert.assertArrayEquals("TGCA".getBytes(), referenceSource.getReferenceBases( + new SAMSequenceRecord("chr2").setMd5("md5_2"), false)); + + } + + @Test + public void testGetReferenceBasesWithValidMD5() throws IOException { + // Create FASTA file and get its MD5 + String sequenceName = "chr1"; + String sequence = "ACGTACGTACGTACGT"; + String md5 = calculateMD5(sequence); + + File fastaFile = createFastaFileWithIndexes("test.fasta", sequenceName, sequence, md5); + + referenceSource = new FolderReferenceSource(testFolder.getAbsolutePath()); + + // Create a SAMSequenceRecord with the MD5 + SAMSequenceRecord record = new SAMSequenceRecord(sequenceName, sequence.length()).setMd5(md5); + + byte[] bases = referenceSource.getReferenceBases(record, false); + + Assert.assertNotNull(bases); + Assert.assertEquals(sequence.length(), bases.length); + Assert.assertEquals("ACGTACGTACGTACGT", new String(bases)); + } + + @Test + public void testGetReferenceBasesWithInvalidMD5() throws IOException { + // Create FASTA file + createFastaFileWithIndexes("test.fasta", "chr1", "ACGT", "valid_md5"); + + referenceSource = new FolderReferenceSource(testFolder.getAbsolutePath()); + + // Create a SAMSequenceRecord with different MD5 + SAMSequenceRecord record = new SAMSequenceRecord("chr1", 4); + record.setAttribute("M5", "invalid_md5"); + + byte[] bases = referenceSource.getReferenceBases(record, false); + + // Should return empty or fallback to EBI + // The behavior depends on implementation + Assert.assertNull(bases); + } + + @Test(expected = GorDataException.class) + public void testGetReferenceBasesWithNullMD5() throws IOException { + createFastaFileWithIndexes("test.fasta", "chr1", "ACGT", "md5_hash"); + + referenceSource = new FolderReferenceSource(testFolder.getAbsolutePath()); + + SAMSequenceRecord record = new SAMSequenceRecord("chr1", 4); + // No MD5 set + + byte[] bases = referenceSource.getReferenceBases(record, false); + } + + @Test + public void testGetReferenceBasesCaseInsensitive() throws IOException { + String sequence = "acgtacgt"; // lowercase + String md5 = calculateMD5(sequence.toUpperCase()); + + File fastaFile = createFastaFileWithIndexes("test.fasta", "chr1", sequence, md5); + + referenceSource = new FolderReferenceSource(testFolder.getAbsolutePath()); + + SAMSequenceRecord record = new SAMSequenceRecord("chr1", sequence.length()); + record.setAttribute("M5", md5); + + byte[] bases = referenceSource.getReferenceBases(record, false); + + // Bases should be converted to uppercase + Assert.assertNotNull(bases); + String basesString = new String(bases); + Assert.assertTrue(basesString.equals(basesString.toUpperCase())); + } + + @Test + public void testCloseReleasesResources() throws IOException { + createFastaFileWithIndexes("test.fasta", "chr1", "ACGT", "md5_hash"); + + referenceSource = new FolderReferenceSource(testFolder.getAbsolutePath()); + + // Close should not throw exception + referenceSource.close(); + } + + @Test + public void testMultipleCloseCalls() throws IOException { + createFastaFileWithIndexes("test.fasta", "chr1", "ACGT", "md5_hash"); + + referenceSource = new FolderReferenceSource(testFolder.getAbsolutePath()); + + // Multiple close calls should be safe + referenceSource.close(); + referenceSource.close(); + } + + @Test(expected = GorResourceException.class) + public void testScanFolderWithCorruptedFastaFile() throws IOException { + // Create a FASTA file that exists but is corrupted + File fastaFile = new File(testFolder, "corrupted.fasta"); + try (FileWriter writer = new FileWriter(fastaFile)) { + writer.write("This is not a valid FASTA file\n"); + } + + // Create index files so it gets picked up + File dictFile = new File(testFolder, "corrupted.dict"); + dictFile.createNewFile(); + File faiFile = new File(testFolder, "corrupted.fai"); + faiFile.createNewFile(); + + // Should handle corrupted files gracefully + referenceSource = new FolderReferenceSource(testFolder.getAbsolutePath()); + } + + @Test + public void testScanFolderWithSubdirectories() throws IOException { + // Create subdirectory with FASTA file + File subDir = new File(testFolder, "subdir"); + subDir.mkdirs(); + + // Only files in the root folder should be scanned + createFastaFileWithIndexes("root.fasta", "chr1", "ACGT", "md5_1"); + + File subFasta = new File(subDir, "sub.fasta"); + try (FileWriter writer = new FileWriter(subFasta)) { + writer.write(">chr1\nACGT\n"); + } + + referenceSource = new FolderReferenceSource(testFolder.getAbsolutePath()); + + // Only root.fasta should be found + Assert.assertNotNull(referenceSource); + Assert.assertEquals(1, referenceSource.getReferenceFiles().size()); + } + + @Test + public void testGetReferenceBasesWithLongSequence() throws IOException { + // Create a longer sequence + StringBuilder longSeq = new StringBuilder(); + for (int i = 0; i < 1000; i++) { + longSeq.append("ACGT"); + } + + String sequence = longSeq.toString(); + String md5 = calculateMD5(sequence); + + createFastaFileWithIndexes("long.fasta", "chr1", sequence, md5); + + referenceSource = new FolderReferenceSource(testFolder.getAbsolutePath()); + + SAMSequenceRecord record = new SAMSequenceRecord("chr1", sequence.length()); + record.setAttribute("M5", md5); + + byte[] bases = referenceSource.getReferenceBases(record, false); + + Assert.assertNotNull(bases); + Assert.assertEquals(sequence.length(), bases.length); + } + + @Test + public void testConcurrentAccess() throws IOException, InterruptedException { + createFastaFileWithIndexes("test.fasta", "chr1", "ACGT", "md5_hash"); + + referenceSource = new FolderReferenceSource(testFolder.getAbsolutePath()); + + SAMSequenceRecord record = new SAMSequenceRecord("chr1", 4); + record.setAttribute("M5", "md5_hash"); + + // Test concurrent access + Thread[] threads = new Thread[10]; + for (int i = 0; i < threads.length; i++) { + threads[i] = new Thread(() -> { + byte[] bases = referenceSource.getReferenceBases(record, false); + Assert.assertNotNull(bases); + }); + threads[i].start(); + } + + for (Thread thread : threads) { + thread.join(); + } + } + + /** + * Helper method to create a FASTA file with corresponding .dict and .fai files. + * Note: This is a simplified version. In a real implementation, you would need + * to create proper FASTA index files using samtools or similar tools. + */ + private File createFastaFileWithIndexes(String fileName, String sequenceName, + String sequence, String md5) throws IOException { + // Create FASTA file + File fastaFile = new File(testFolder, fileName); + var header = ""; + try (FileWriter writer = new FileWriter(fastaFile)) { + header = ">" + sequenceName; + if (md5 != null && !md5.isEmpty()) { + header += " MD5:" + md5; + } + header += "\n"; + writer.write(header); + + // Write sequence in 60 char lines (FASTA convention) + int lineLength = 60; + for (int i = 0; i < sequence.length(); i += lineLength) { + int end = Math.min(i + lineLength, sequence.length()); + writer.write(sequence.substring(i, end)); + writer.write("\n"); + } + } + + // Create .dict file (simplified - real dict files have specific format) + String baseName = fileName.substring(0, fileName.lastIndexOf('.')); + File dictFile = new File(testFolder, baseName + ".dict"); + try (FileWriter writer = new FileWriter(dictFile)) { + writer.write("@HD\tVN:1.0\n"); + writer.write("@SQ\tSN:" + sequenceName + "\tLN:" + sequence.length()); + if (md5 != null && !md5.isEmpty()) { + writer.write("\tM5:" + md5); + } + writer.write("\n"); + } + + // Create .fai file (FASTA index - simplified) + File faiFile = new File(testFolder, fileName + ".fai"); + try (FileWriter writer = new FileWriter(faiFile)) { + // Format: sequence_name, length, offset, linebases, linewidth + // This is simplified - real .fai files need proper calculation + writer.write(sequenceName + "\t" + sequence.length() + "\t" + header.length() + "\t60\t61\n"); + } + + return fastaFile; + } + + private File createRefbasesFile(String md5, byte[] bytes) throws IOException { + Path refbases = testFolder.toPath().resolve("md5_" + md5 + ".txt"); + Files.write(refbases, bytes); + return refbases.toFile(); + } + + /** + * Helper method to calculate MD5 hash of a string. + * This is a simplified version - in reality, MD5 should be calculated + * from the actual sequence bytes in the same way htsjdk does it. + */ + private String calculateMD5(String sequence) { + try { + java.security.MessageDigest md = java.security.MessageDigest.getInstance("MD5"); + byte[] digest = md.digest(sequence.getBytes()); + StringBuilder sb = new StringBuilder(); + for (byte b : digest) { + sb.append(String.format("%02x", b)); + } + return sb.toString(); + } catch (Exception e) { + return "test_md5_hash"; + } + } +} diff --git a/test/src/main/java/gorsat/TestUtils.java b/test/src/main/java/gorsat/TestUtils.java index 90904035c..dc1dfadec 100644 --- a/test/src/main/java/gorsat/TestUtils.java +++ b/test/src/main/java/gorsat/TestUtils.java @@ -304,6 +304,8 @@ public static PipeInstance createPipeInstance(boolean server, String gorroot, St options.add("-cachedir"); options.add(cacheDir); } +// options.add("-config"); +// options.add("../tests/data/ref_mini/gor_config.txt"); return PipeInstance.createGorIterator(new GorContext(createSession(options.toArray(String[]::new), null, server, securityContext, writeLocations))); }