From 4ebe7b831f436c8702acb0ff9a2ae88675ec797f Mon Sep 17 00:00:00 2001 From: Gisli Magnusson Date: Tue, 3 Feb 2026 14:22:36 +0000 Subject: [PATCH 1/7] feat(ENGKNOW-3046): Enable multiple cram reference files. --- gortools/src/test/java/gorsat/UTestCram.java | 47 ++- .../stream/datatypes/cram/CramIterator.java | 94 +++-- .../reference/CompositeReferenceSource.java | 55 +++ .../cram/reference/EBIReferenceSource.java | 155 ++++++++ .../cram/reference/FolderReferenceSource.java | 125 +++++++ .../reference/MD5CachedReferenceSource.java | 88 +++++ .../SharedCachedReferenceSource.java | 2 +- .../SharedChromSeqReferenceSource.java | 2 +- .../SharedFastaReferenceSource.java | 2 +- .../org/gorpipe/gor/table/util/PathUtils.java | 9 + .../UTestCompositeReferenceSource.java | 181 +++++++++ .../reference/UTestEBIReferenceSource.java | 245 ++++++++++++ .../reference/UTestFolderReferenceSource.java | 351 ++++++++++++++++++ test/src/main/java/gorsat/TestUtils.java | 2 + 14 files changed, 1290 insertions(+), 68 deletions(-) create mode 100644 model/src/main/java/org/gorpipe/gor/driver/providers/stream/datatypes/cram/reference/CompositeReferenceSource.java create mode 100644 model/src/main/java/org/gorpipe/gor/driver/providers/stream/datatypes/cram/reference/EBIReferenceSource.java create mode 100644 model/src/main/java/org/gorpipe/gor/driver/providers/stream/datatypes/cram/reference/FolderReferenceSource.java create mode 100644 model/src/main/java/org/gorpipe/gor/driver/providers/stream/datatypes/cram/reference/MD5CachedReferenceSource.java rename model/src/main/java/org/gorpipe/gor/{model => driver/providers/stream/datatypes/cram/reference}/SharedCachedReferenceSource.java (98%) rename model/src/main/java/org/gorpipe/gor/{model => driver/providers/stream/datatypes/cram/reference}/SharedChromSeqReferenceSource.java (97%) rename model/src/main/java/org/gorpipe/gor/{model => driver/providers/stream/datatypes/cram/reference}/SharedFastaReferenceSource.java (98%) create mode 100644 model/src/test/java/org/gorpipe/gor/driver/providers/stream/datatypes/cram/reference/UTestCompositeReferenceSource.java create mode 100644 model/src/test/java/org/gorpipe/gor/driver/providers/stream/datatypes/cram/reference/UTestEBIReferenceSource.java create mode 100644 model/src/test/java/org/gorpipe/gor/driver/providers/stream/datatypes/cram/reference/UTestFolderReferenceSource.java diff --git a/gortools/src/test/java/gorsat/UTestCram.java b/gortools/src/test/java/gorsat/UTestCram.java index 7d32d0cb0..9b1d20bc9 100644 --- a/gortools/src/test/java/gorsat/UTestCram.java +++ b/gortools/src/test/java/gorsat/UTestCram.java @@ -31,12 +31,17 @@ import org.junit.Assert; import org.junit.Rule; import org.junit.Test; +import org.junit.contrib.java.lang.system.RestoreSystemProperties; import org.junit.rules.TemporaryFolder; import java.io.File; import java.io.IOException; import java.nio.charset.Charset; import java.nio.file.Paths; +import java.util.List; + +import static gorsat.TestUtils.LINE_SPLIT_PATTERN; +import static org.gorpipe.gor.driver.providers.stream.datatypes.cram.CramIterator.KEY_REFERENCE_FORCE_FOLDER; public class UTestCram { @@ -45,6 +50,9 @@ public class UTestCram { @Rule public TemporaryFolder workDir = new TemporaryFolder(); + @Rule + public RestoreSystemProperties restoreSystemProperties = new RestoreSystemProperties(); + public static File createWrongConfigFile(File directory) throws IOException { return FileTestUtils.createTempFile(directory, "generic.gor", "buildPath\t../tests/data/ref_mini/chromSeq\n" + @@ -106,7 +114,7 @@ public void readCramWithFastaReferenceFromConfig() { public void readCramWithFastaReferenceFromConfigException() throws IOException { File wrongConfigFile = createWrongConfigFile(workDir.getRoot()); System.clearProperty("gor.driver.cram.fastareferencesource"); - String[] args = new String[]{ + String[] args = new String[] { "gor " + DataUtil.toFile("../tests/data/external/samtools/cram_query_sorted", DataType.CRAM), "-config", wrongConfigFile.getCanonicalPath()}; @@ -120,24 +128,29 @@ public void readCramWithFastaReferenceFromConfigException() throws IOException { @Test public void readCramWithFastaReferenceAndGenerateMissingAttributes() { - try { - System.setProperty("gor.driver.cram.fastareferencesource", DataUtil.toFile("../tests/data/external/samtools/cram_query_sorted", DataType.FASTA)); - System.setProperty("gor.driver.cram.generatemissingattributes", "false"); - String[] linesWithoutMissingAttributes = TestUtils.runGorPipeLines("gor " + DataUtil.toFile("../tests/data/external/samtools/cram_query_sorted", DataType.CRAM)); - System.setProperty("gor.driver.cram.generatemissingattributes", "true"); - String[] linesWithMissingAttributes = TestUtils.runGorPipeLines("gor " + DataUtil.toFile("../tests/data/external/samtools/cram_query_sorted", DataType.CRAM)); - - Assert.assertEquals(8, linesWithoutMissingAttributes.length); - Assert.assertEquals(8, linesWithMissingAttributes.length); - // See if we have the missing entry in the last column. - Assert.assertFalse(linesWithoutMissingAttributes[1].contains("NM=")); - Assert.assertTrue(linesWithMissingAttributes[1].contains("NM=")); + System.setProperty("gor.driver.cram.fastareferencesource", DataUtil.toFile("../tests/data/external/samtools/cram_query_sorted", DataType.FASTA)); + System.setProperty(KEY_REFERENCE_FORCE_FOLDER, "false"); - } finally { - System.clearProperty("gor.driver.cram.fastareferencesource"); - System.clearProperty("gor.driver.cram.generatemissingattributes"); - } + String[] args = new String[] {"gor " + DataUtil.toFile("../tests/data/external/samtools/cram_query_sorted", DataType.CRAM)}; + + System.setProperty("gor.driver.cram.generatemissingattributes", "false"); + String[] linesWithoutMissingAttributes = TestUtils.runGorPipe(args, false).split(LINE_SPLIT_PATTERN); + + System.setProperty("gor.driver.cram.generatemissingattributes", "true"); + String[] linesWithMissingAttributesCramRef = TestUtils.runGorPipe(args, false).split(LINE_SPLIT_PATTERN); + + args = new String[] { + "gor " + DataUtil.toFile("../tests/data/external/samtools/cram_query_sorted", DataType.CRAM) + , "-config", "../tests/data/ref_mini/gor_config.txt"}; + String[] linesWithMissingAttributesProjectRef = TestUtils.runGorPipe(args, false).split(LINE_SPLIT_PATTERN); + Assert.assertEquals(8, linesWithoutMissingAttributes.length); + Assert.assertEquals(8, linesWithMissingAttributesCramRef.length); + Assert.assertEquals(8, linesWithMissingAttributesProjectRef.length); + // See if we have the missing entry in the last column. + Assert.assertFalse(linesWithoutMissingAttributes[1].contains("NM=")); + Assert.assertTrue(linesWithMissingAttributesCramRef[1].contains("NM=")); + Assert.assertTrue(linesWithMissingAttributesProjectRef[1].contains("NM=")); } @Test(expected = GorResourceException.class) diff --git a/model/src/main/java/org/gorpipe/gor/driver/providers/stream/datatypes/cram/CramIterator.java b/model/src/main/java/org/gorpipe/gor/driver/providers/stream/datatypes/cram/CramIterator.java index b19c8bdcb..fe31c7a7f 100644 --- a/model/src/main/java/org/gorpipe/gor/driver/providers/stream/datatypes/cram/CramIterator.java +++ b/model/src/main/java/org/gorpipe/gor/driver/providers/stream/datatypes/cram/CramIterator.java @@ -32,17 +32,19 @@ import org.apache.commons.io.FilenameUtils; import org.apache.commons.lang3.StringUtils; import org.gorpipe.exceptions.GorResourceException; -import org.gorpipe.gor.driver.meta.DataType; import org.gorpipe.gor.driver.providers.stream.datatypes.bam.BamIterator; +import org.gorpipe.gor.driver.providers.stream.datatypes.cram.reference.CompositeReferenceSource; +import org.gorpipe.gor.driver.providers.stream.datatypes.cram.reference.EBIReferenceSource; +import org.gorpipe.gor.driver.providers.stream.datatypes.cram.reference.FolderReferenceSource; +import org.gorpipe.gor.driver.providers.stream.datatypes.cram.reference.SharedFastaReferenceSource; import org.gorpipe.gor.model.ChromoLookup; import org.gorpipe.gor.session.GorSession; import org.gorpipe.gor.driver.adapters.StreamSourceSeekableStream; import org.gorpipe.gor.driver.providers.stream.sources.StreamSource; import org.gorpipe.gor.table.util.PathUtils; -import org.gorpipe.gor.util.DataUtil; -import org.gorpipe.gor.util.StringUtil; import org.gorpipe.gor.model.Row; -import org.gorpipe.gor.model.SharedFastaReferenceSource; +import org.gorpipe.model.gor.iterators.RefSeq; +import org.gorpipe.util.Strings; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -66,8 +68,9 @@ */ public class CramIterator extends BamIterator { - private final static String KEY_GENERATEMISSINGATTRIBUTES = "gor.driver.cram.generatemissingattributes"; - private final static String KEY_FASTAREFERENCESOURCE = "gor.driver.cram.fastareferencesource"; + public final static String KEY_GENERATEMISSINGATTRIBUTES = "gor.driver.cram.generatemissingattributes"; + public final static String KEY_FASTAREFERENCESOURCE = "gor.driver.cram.fastareferencesource"; + public final static String KEY_REFERENCE_FORCE_FOLDER = "gor.driver.cram.reference.force.folder."; private static final Logger log = LoggerFactory.getLogger(CramIterator.class); @@ -75,10 +78,11 @@ public class CramIterator extends BamIterator { private int[] columns; ChromoLookup lookup; private String fileName; - private String cramReferencePath = ""; - private CRAMFileReader cramFileReader; - private ReferenceSequenceFile referenceSequenceFile; + private String projectCramReferencePath; // Cram reference path from project context. + private RefSeq projectRefSeq; // RefSeq from project context, used for MD tag calculation. + private ReferenceSequenceFile referenceSequenceFile; // Handle to cram reference file, fallback for MD tag calculation. private CRAMReferenceSource referenceSource; + private CRAMFileReader cramFileReader; private boolean generateMissingCramAttributes; /** @@ -94,35 +98,6 @@ public CramIterator(ChromoLookup lookup, CramFile cramFile, int[] columns) { this.lookup = lookup; } - - /** - * Construct a CramIterator - * - * @param lookup The lookup service for chromosome name to ids - * @param file The CRAM File to iterate through - */ - public CramIterator(ChromoLookup lookup, String file, String index, String reference, boolean generateMissingAttributes) { - - fileName = file; - generateMissingCramAttributes = generateMissingAttributes; - File cramfile = new File(file); - File cramindex = new File(index); - if (!cramindex.exists()) { - cramindex = new File(DataUtil.toFile(file, DataType.CRAI)); - } - - referenceSequenceFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(new File(reference)); - referenceSource = createReferenceSource(fileName, ""); - - try { - cramFileReader = new CRAMFileReader(cramfile, new FileInputStream(cramindex), referenceSource); - } catch (FileNotFoundException e) { - throw new GorResourceException("Cram file not found.", file, e); - } - SamReader samreader = new SamReader.PrimitiveSamReaderToSamReaderAdapter(cramFileReader, null); - init(lookup, samreader, true); - } - @Override public Row next() { Row row = super.next(); @@ -135,8 +110,16 @@ public Row next() { boolean calculateNM = record.getIntegerAttribute(SAMTag.NM.name()) == null; if (calculateMD) { - byte[] referenceBytes = referenceSequenceFile.getSubsequenceAt(record.getContig(), record.getAlignmentStart(), record.getAlignmentEnd()).getBases(); - CramUtils.calculateMdAndNmTags(record, referenceBytes, calculateMD, calculateNM); + byte[] referenceBytes = null; + if (projectRefSeq != null) { + referenceBytes = projectRefSeq.getBases(record.getContig(), record.getAlignmentStart(), record.getAlignmentEnd()).getBytes(); + } else if (referenceSequenceFile != null) { + // Fallback to the reference file used by the CRAM reader + referenceBytes = referenceSequenceFile.getSubsequenceAt(record.getContig(), record.getAlignmentStart(), record.getAlignmentEnd()).getBases(); + } + if (referenceBytes != null) { + CramUtils.calculateMdAndNmTags(record, referenceBytes, calculateMD, calculateNM); + } } else if (calculateNM) { SequenceUtil.calculateSamNmTagFromCigar(record); } @@ -170,7 +153,10 @@ public void init(GorSession session) { return; } - cramReferencePath = session.getProjectContext().getReferenceBuild().getCramReferencePath(); + projectCramReferencePath = session.getProjectContext().getReferenceBuild().getCramReferencePath(); + if (!Strings.isNullOrEmpty(session.getProjectContext().getGorConfigFile())) { + projectRefSeq = session.getProjectContext().createRefSeq(); + } if (cramFile != null) { // I read this property here through System.getProperty as there is no other way to pass properties to the driver @@ -237,7 +223,7 @@ private CRAMReferenceSource createReferenceSource(String ref, String root) { } // This reference should be fasta but we let the htsjdk library decide - return createFileReference(file.toString()); + return createFileReference(file); } private File getReferenceFromGorOptions(File file) { @@ -252,8 +238,8 @@ private File getReferenceFromGorOptions(File file) { } private File getReferenceFromGorConfig(File file, String root) { - if (!file.exists() && !StringUtil.isEmpty(cramReferencePath)) { - return PathUtils.resolve(Paths.get(root), Paths.get(cramReferencePath)).toFile(); + if (!file.exists() && !Strings.isNullOrEmpty(projectCramReferencePath)) { + return PathUtils.resolve(Paths.get(root), Paths.get(projectCramReferencePath)).toFile(); } return file; } @@ -277,10 +263,22 @@ private File getReferenceFromReferenceLinkFile(File file) { return file; } - private CRAMReferenceSource createFileReference(String ref) { - String referenceKey = FilenameUtils.removeExtension(FilenameUtils.getBaseName(ref)); - referenceSequenceFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(new File(ref)); - return new SharedFastaReferenceSource(referenceSequenceFile, referenceKey); + private CRAMReferenceSource createFileReference(File refFile) { + if (refFile.isDirectory()) { + return new CompositeReferenceSource(List.of( + new FolderReferenceSource(refFile.getPath()), + new EBIReferenceSource(refFile.getPath()))); + } else if (Boolean.getBoolean(System.getProperty(KEY_REFERENCE_FORCE_FOLDER, "true"))) { + return new CompositeReferenceSource(List.of( + new FolderReferenceSource(refFile.getParent()), + new EBIReferenceSource(refFile.getParent()))); + } else { + referenceSequenceFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(refFile); + + String referenceKey = FilenameUtils.removeExtension(refFile.getName()); + var referenceFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(refFile); + return new SharedFastaReferenceSource(referenceFile, referenceKey); + } } } diff --git a/model/src/main/java/org/gorpipe/gor/driver/providers/stream/datatypes/cram/reference/CompositeReferenceSource.java b/model/src/main/java/org/gorpipe/gor/driver/providers/stream/datatypes/cram/reference/CompositeReferenceSource.java new file mode 100644 index 000000000..963656f78 --- /dev/null +++ b/model/src/main/java/org/gorpipe/gor/driver/providers/stream/datatypes/cram/reference/CompositeReferenceSource.java @@ -0,0 +1,55 @@ +package org.gorpipe.gor.driver.providers.stream.datatypes.cram.reference; + +import htsjdk.samtools.SAMSequenceRecord; +import htsjdk.samtools.cram.ref.CRAMReferenceSource; + +import java.io.Closeable; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +/** + * Composite reference source, tries out different reference source in order. + */ +public class CompositeReferenceSource implements CRAMReferenceSource, Closeable { + + List sources; + + public CompositeReferenceSource(List sources) { + this.sources = sources != null ? new ArrayList<>(sources) : new ArrayList<>(); + } + + @Override + public byte[] getReferenceBases(SAMSequenceRecord sequenceRecord, boolean tryNameVariants) { + byte[] bytes = null; + for (var source : sources) { + bytes = source.getReferenceBases(sequenceRecord, tryNameVariants); + if (bytes != null) { + return bytes; + } + } + return bytes; + } + + @Override + public byte[] getReferenceBasesByRegion(SAMSequenceRecord sequenceRecord, int zeroBasedStart, int requestedRegionLength) { + byte[] bytes = null; + for (var source : sources) { + bytes = source.getReferenceBasesByRegion(sequenceRecord, zeroBasedStart, requestedRegionLength); + if (bytes != null) { + return bytes; + } + } + return bytes; + } + + @Override + public void close() throws IOException { + for (var source : sources) { + if (source instanceof Closeable) { + ((Closeable) source).close(); + } + } + sources.clear(); + } +} diff --git a/model/src/main/java/org/gorpipe/gor/driver/providers/stream/datatypes/cram/reference/EBIReferenceSource.java b/model/src/main/java/org/gorpipe/gor/driver/providers/stream/datatypes/cram/reference/EBIReferenceSource.java new file mode 100644 index 000000000..d12dbfac7 --- /dev/null +++ b/model/src/main/java/org/gorpipe/gor/driver/providers/stream/datatypes/cram/reference/EBIReferenceSource.java @@ -0,0 +1,155 @@ +package org.gorpipe.gor.driver.providers.stream.datatypes.cram.reference; + +import htsjdk.samtools.Defaults; +import htsjdk.samtools.SAMSequenceRecord; +import org.gorpipe.exceptions.GorDataException; +import org.gorpipe.exceptions.GorResourceException; +import org.gorpipe.gor.table.util.PathUtils; +import org.gorpipe.util.Strings; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.BufferedInputStream; +import java.io.IOException; +import java.net.HttpURLConnection; +import java.net.URL; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.HashSet; +import java.util.Map; +import java.util.Set; +import java.util.concurrent.ConcurrentHashMap; + +/** + * Get reference sources from the EBI service. Optionally caching them locally for future reference. + * The downloaded references sequences are stored in the provided folder as md5_.txt files. + */ +public class EBIReferenceSource extends MD5CachedReferenceSource { + private static final Logger log = LoggerFactory.getLogger(EBIReferenceSource.class); + + public static final String KEY_USE_CRAM_REF_DOWNLOAD = "gor.driver.cram.ref.download"; + + private static final String REFBASES_PREFIX = "md5_"; + private static final String REFBASES_EXT = ".txt"; + + protected static Map md5ToRefbases = new ConcurrentHashMap<>(); + + private Path referenceFolder; // If null we do not download. + + public EBIReferenceSource() { + } + + public EBIReferenceSource(String referenceFolder) { + if (!Strings.isNullOrEmpty(referenceFolder)) { + this.referenceFolder = Path.of(referenceFolder); + + if (!Files.isDirectory(this.referenceFolder)) { + throw new GorResourceException("Can not create FolderReferenceSource %s as the target is not a folder or does not exists".formatted(referenceFolder), referenceFolder); + } + scanReferenceFolder(); + } + } + + Set getRefbasesFiles() { + return new HashSet<>(md5ToRefbases.values()); + } + + private void scanReferenceFolder() { + md5ToRefbases.clear(); + + if (referenceFolder == null) return; + + try { + for (var p : Files.list(referenceFolder).filter(Files::isRegularFile).toList()) { + var f = p.getFileName().toString().toLowerCase(); + if (f.startsWith(REFBASES_PREFIX) && f.endsWith(REFBASES_EXT)) { + processRefbasesFile(referenceFolder.resolve(f)); + } + } + } catch (IOException e) { + log.warn("Failed scanning reference folder {}", referenceFolder, e); + } + } + + private void processRefbasesFile(Path refbases) { + String fileName = refbases.getFileName().toString(); + if (!fileName.startsWith(REFBASES_PREFIX) || !fileName.endsWith(REFBASES_EXT)) return; + + String md5 = fileName.substring(REFBASES_PREFIX.length(), fileName.length() - REFBASES_EXT.length()); + md5ToRefbases.put(md5, refbases); + } + + @Override + protected byte[] loadReference(final SAMSequenceRecord record) { + // Load from refbases file. + Path refbasesPath = md5ToRefbases.get(record.getMd5()); + if (refbasesPath != null) { + try { + byte[] bases = Files.readAllBytes(refbasesPath); + if (bases == null || bases.length == 0) { + throw new GorDataException("Reference sequence in " + refbasesPath + " is empty"); + } + return bases; + } catch (IOException e) { + throw new GorDataException("Could not read refbases file %s".formatted(refbasesPath), e); + } + } + + // Load from EBI service. + if (Boolean.parseBoolean(System.getProperty(KEY_USE_CRAM_REF_DOWNLOAD, + Boolean.toString(Defaults.USE_CRAM_REF_DOWNLOAD)))) { + try { + + byte[] bases = downloadFromEBI(record.getMd5()); + if (bases != EMPTY_BASES) { + saveRefbasesToDisk(record.getMd5(), bases); + } + return bases; + } catch (IOException e) { + log.warn("Could not download/save reference sequence for md5 " + record.getMd5(), e); + } + } + + return EMPTY_BASES; + } + + /** + * Download reference sequence from EBI by MD5 and store it in the reference folder. + * @param md5 + * @return bytes of the reference sequence, null if not found. + * @throws IOException + */ + private byte[] downloadFromEBI(String md5) throws IOException { + log.info("Downloading reference {} from ENA", md5); + URL url = new URL(String.format(Defaults.EBI_REFERENCE_SERVICE_URL_MASK, md5)); + HttpURLConnection conn = (HttpURLConnection) url.openConnection(); + conn.setConnectTimeout(15000); + conn.setReadTimeout(30000); + conn.setRequestMethod("GET"); + + if (conn.getResponseCode() != 200) { + log.warn("ENA returned {} for {}", conn.getResponseCode(), md5); + return EMPTY_BASES; + } + + byte[] bases; + try (BufferedInputStream in = new BufferedInputStream(conn.getInputStream())) { + bases = in.readAllBytes(); + } + if (bases.length == 0) return EMPTY_BASES; + + return bases; + } + + private void saveRefbasesToDisk(String md5, byte[] bases) throws IOException { + if (referenceFolder == null) return; + + Path basesPath = referenceFolder.resolve(REFBASES_PREFIX + md5 + REFBASES_EXT); + Path tempBasesPath = PathUtils.getTempFilePath(basesPath); + + Files.write(tempBasesPath, bases); + Files.move(tempBasesPath, basesPath); + + md5ToRefbases.put(md5, basesPath); + } +} diff --git a/model/src/main/java/org/gorpipe/gor/driver/providers/stream/datatypes/cram/reference/FolderReferenceSource.java b/model/src/main/java/org/gorpipe/gor/driver/providers/stream/datatypes/cram/reference/FolderReferenceSource.java new file mode 100644 index 000000000..127b3ebd3 --- /dev/null +++ b/model/src/main/java/org/gorpipe/gor/driver/providers/stream/datatypes/cram/reference/FolderReferenceSource.java @@ -0,0 +1,125 @@ +/* + * BEGIN_COPYRIGHT + * + * Copyright (C) 2011-2013 deCODE genetics Inc. + * Copyright (C) 2013-2019 WuXi NextCode Inc. + * All Rights Reserved. + * + * GORpipe is free software: you can redistribute it and/or modify + * it under the terms of the AFFERO GNU General Public License as published by + * the Free Software Foundation. + * + * GORpipe is distributed "AS-IS" AND WITHOUT ANY WARRANTY OF ANY KIND, + * INCLUDING ANY IMPLIED WARRANTY OF MERCHANTABILITY, + * NON-INFRINGEMENT, OR FITNESS FOR A PARTICULAR PURPOSE. See + * the AFFERO GNU General Public License for the complete license terms. + * + * You should have received a copy of the AFFERO GNU General Public License + * along with GORpipe. If not, see + * + * END_COPYRIGHT + */ + +package org.gorpipe.gor.driver.providers.stream.datatypes.cram.reference; + +import htsjdk.samtools.SAMSequenceDictionary; +import htsjdk.samtools.SAMSequenceRecord; +import htsjdk.samtools.reference.ReferenceSequenceFile; +import htsjdk.samtools.reference.ReferenceSequenceFileFactory; +import org.gorpipe.exceptions.GorDataException; +import org.gorpipe.exceptions.GorResourceException; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.*; +import java.util.concurrent.ConcurrentHashMap; + +/** + * Reference source that resolves references by MD5 using a folder of FASTA files (with .dict and .fai) + * + */ + +public class FolderReferenceSource extends MD5CachedReferenceSource { + + private static final Logger log = LoggerFactory.getLogger(FolderReferenceSource.class); + + private static final Set FASTA_EXT = Set.of("fa", "fasta"); + + private record Md5ReferencePath(String md5, Path path, String contig) {} + protected static Map md5ToReferencePath = new ConcurrentHashMap<>(); + + private Path referenceFolder; + + private final Map refFileByPath = new ConcurrentHashMap<>(); + + public FolderReferenceSource(String referenceFolder) { + this.referenceFolder = Path.of(referenceFolder); + if (!Files.isDirectory(this.referenceFolder)) { + throw new GorResourceException("Can not create FolderReferenceSource %s as the target is not a folder or does not exists".formatted(referenceFolder), referenceFolder); + } + scanReferenceFolder(); + } + + @Override + protected byte[] loadReference(final SAMSequenceRecord record) { + // Load form fasta file. + Md5ReferencePath referencePath = md5ToReferencePath.get(record.getMd5()); + if (referencePath != null) { + ReferenceSequenceFile rsFile = refFileByPath.computeIfAbsent(referencePath.path(), ReferenceSequenceFileFactory::getReferenceSequenceFile); + if (rsFile == null) { + throw new GorDataException(String.format("Reference file %s for md5 %s not found in reference folder %s", + referencePath, referencePath.md5(), referenceFolder)); + } + + return rsFile.getSequence(referencePath.contig()).getBases(); + } + + return EMPTY_BASES; + } + + private void scanReferenceFolder() { + md5ToReferencePath.clear(); + + try { + for (var p : Files.list(referenceFolder).filter(Files::isRegularFile).toList()) { + var f = p.getFileName().toString().toLowerCase(); + if (FASTA_EXT.stream().anyMatch(ext -> f.endsWith("." + ext))) { + processFasta(referenceFolder.resolve(f)); + } + } + } catch (IOException e) { + log.warn("Failed scanning reference folder {}", referenceFolder, e); + } + } + + private void processFasta(Path fastaFile) { + ReferenceSequenceFile refFile = refFileByPath.computeIfAbsent(fastaFile, ReferenceSequenceFileFactory::getReferenceSequenceFile); + SAMSequenceDictionary dictionary = refFile.getSequenceDictionary(); + if (dictionary == null) { + throw new GorResourceException("Fasta file %s is invalid cram reference as it is missing dict file".formatted(fastaFile), fastaFile.toString()); + } + for (SAMSequenceRecord rec : dictionary.getSequences()) { + String md5 = rec.getMd5(); + if (md5 == null || md5.isEmpty()) continue; + md5ToReferencePath.put(md5, new FolderReferenceSource.Md5ReferencePath(md5, fastaFile, rec.getContig())); + } + } + + @Override + public void close() { + refFileByPath.values().forEach(r -> { + try { + r.close(); + } catch (IOException ignored) { + } + }); + refFileByPath.clear(); + } + + Set getReferenceFiles() { + return new HashSet<>(refFileByPath.keySet()); + } +} diff --git a/model/src/main/java/org/gorpipe/gor/driver/providers/stream/datatypes/cram/reference/MD5CachedReferenceSource.java b/model/src/main/java/org/gorpipe/gor/driver/providers/stream/datatypes/cram/reference/MD5CachedReferenceSource.java new file mode 100644 index 000000000..ccd6c6180 --- /dev/null +++ b/model/src/main/java/org/gorpipe/gor/driver/providers/stream/datatypes/cram/reference/MD5CachedReferenceSource.java @@ -0,0 +1,88 @@ +package org.gorpipe.gor.driver.providers.stream.datatypes.cram.reference; + +import com.google.common.cache.Cache; +import com.google.common.cache.CacheBuilder; +import com.google.common.cache.RemovalListener; +import htsjdk.samtools.SAMSequenceRecord; +import htsjdk.samtools.cram.ref.CRAMReferenceSource; +import htsjdk.samtools.util.StringUtil; +import htsjdk.utils.ValidationUtils; +import org.gorpipe.exceptions.GorDataException; +import org.gorpipe.util.Strings; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.Closeable; +import java.io.IOException; +import java.util.Arrays; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.TimeUnit; + +public abstract class MD5CachedReferenceSource implements CRAMReferenceSource, Closeable { + private static final Logger log = LoggerFactory.getLogger(MD5CachedReferenceSource.class); + + public static final byte[] EMPTY_BASES = new byte[0]; + + private static final Cache md5BasesCache = createMd5BasesCache(); + + private static Cache createMd5BasesCache () { + RemovalListener removalNotifier; + removalNotifier = notification -> log.debug("Removing from cache, key: {}, cause: {}", notification.getKey(), notification.getCause()); + + // TODO: Move this to owner config or context object when the dirver model supports that + Integer cacheTimout = Integer.parseInt(System.getProperty("gor.driver.cram.referencetimeout", "60")); // Seconds + + return CacheBuilder.newBuilder().removalListener(removalNotifier) + .expireAfterAccess(cacheTimout, TimeUnit.SECONDS).build(); + } + + @Override + public synchronized byte[] getReferenceBases(final SAMSequenceRecord record, + final boolean tryNameVariants) { + // check cache by sequence name: + final String md5 = record.getMd5(); + + if (Strings.isNullOrEmpty(md5)) { + throw new GorDataException("Can not load reference bases as SAMSequenceRecord does not contain MD5"); + } + + try { + var bases = md5BasesCache.get(md5, () -> loadReference(record)); + StringUtil.toUpperCase(bases); + return bases != EMPTY_BASES ? bases : null; + }catch (ExecutionException e) { + throw new GorDataException("Failed to load CRAM reference: " + md5, e); + } + } + + @Override + public byte[] getReferenceBasesByRegion( + final SAMSequenceRecord sequenceRecord, + final int zeroBasedStart, + final int requestedRegionLength) { + + ValidationUtils.validateArg(zeroBasedStart >= 0, "start must be >= 0"); + byte[] bases = getReferenceBases(sequenceRecord, false); + if (bases != null) { + if (zeroBasedStart >= bases.length) { + throw new IllegalArgumentException(String.format("Requested start %d is beyond the sequence length %s", + zeroBasedStart, + sequenceRecord.getSequenceName())); + } + return Arrays.copyOfRange(bases, zeroBasedStart, Math.min(bases.length, zeroBasedStart + requestedRegionLength)); + } + return bases; + } + + /** + * Load reference by MD5 from disk, downloading from EBI ENA if needed. + * @param record SAM record with the sequence detail (must include MD5). + * @return the bases, or EMPTY_BASES if no bases where found on disk and on EBI. + */ + protected abstract byte[] loadReference(SAMSequenceRecord record); + + @Override + public void close() throws IOException { + + } +} diff --git a/model/src/main/java/org/gorpipe/gor/model/SharedCachedReferenceSource.java b/model/src/main/java/org/gorpipe/gor/driver/providers/stream/datatypes/cram/reference/SharedCachedReferenceSource.java similarity index 98% rename from model/src/main/java/org/gorpipe/gor/model/SharedCachedReferenceSource.java rename to model/src/main/java/org/gorpipe/gor/driver/providers/stream/datatypes/cram/reference/SharedCachedReferenceSource.java index c969f5146..4253dd374 100644 --- a/model/src/main/java/org/gorpipe/gor/model/SharedCachedReferenceSource.java +++ b/model/src/main/java/org/gorpipe/gor/driver/providers/stream/datatypes/cram/reference/SharedCachedReferenceSource.java @@ -20,7 +20,7 @@ * END_COPYRIGHT */ -package org.gorpipe.gor.model; +package org.gorpipe.gor.driver.providers.stream.datatypes.cram.reference; import com.google.common.cache.Cache; import com.google.common.cache.CacheBuilder; diff --git a/model/src/main/java/org/gorpipe/gor/model/SharedChromSeqReferenceSource.java b/model/src/main/java/org/gorpipe/gor/driver/providers/stream/datatypes/cram/reference/SharedChromSeqReferenceSource.java similarity index 97% rename from model/src/main/java/org/gorpipe/gor/model/SharedChromSeqReferenceSource.java rename to model/src/main/java/org/gorpipe/gor/driver/providers/stream/datatypes/cram/reference/SharedChromSeqReferenceSource.java index e8c768224..eec784619 100644 --- a/model/src/main/java/org/gorpipe/gor/model/SharedChromSeqReferenceSource.java +++ b/model/src/main/java/org/gorpipe/gor/driver/providers/stream/datatypes/cram/reference/SharedChromSeqReferenceSource.java @@ -20,7 +20,7 @@ * END_COPYRIGHT */ -package org.gorpipe.gor.model; +package org.gorpipe.gor.driver.providers.stream.datatypes.cram.reference; import htsjdk.samtools.SAMSequenceRecord; import org.gorpipe.gor.driver.meta.DataType; diff --git a/model/src/main/java/org/gorpipe/gor/model/SharedFastaReferenceSource.java b/model/src/main/java/org/gorpipe/gor/driver/providers/stream/datatypes/cram/reference/SharedFastaReferenceSource.java similarity index 98% rename from model/src/main/java/org/gorpipe/gor/model/SharedFastaReferenceSource.java rename to model/src/main/java/org/gorpipe/gor/driver/providers/stream/datatypes/cram/reference/SharedFastaReferenceSource.java index 7c98a7103..11011ab1d 100644 --- a/model/src/main/java/org/gorpipe/gor/model/SharedFastaReferenceSource.java +++ b/model/src/main/java/org/gorpipe/gor/driver/providers/stream/datatypes/cram/reference/SharedFastaReferenceSource.java @@ -20,7 +20,7 @@ * END_COPYRIGHT */ -package org.gorpipe.gor.model; +package org.gorpipe.gor.driver.providers.stream.datatypes.cram.reference; import htsjdk.samtools.SAMException; import htsjdk.samtools.SAMSequenceRecord; diff --git a/model/src/main/java/org/gorpipe/gor/table/util/PathUtils.java b/model/src/main/java/org/gorpipe/gor/table/util/PathUtils.java index fc625756b..d006a0d97 100644 --- a/model/src/main/java/org/gorpipe/gor/table/util/PathUtils.java +++ b/model/src/main/java/org/gorpipe/gor/table/util/PathUtils.java @@ -232,6 +232,10 @@ public static String removeExtensions(String fileName) { return fileName.contains(".") ? fileName.substring(0, fileName.indexOf('.')) : fileName; } + public static String removeLastExtension(String fileName) { + return fileName.contains(".") ? fileName.substring(0, fileName.lastIndexOf('.')) : fileName; + } + public static String markAsFolder(String path) { if (!path.endsWith("/")) { return path + "/"; @@ -335,6 +339,11 @@ public static String getCurrentAbsolutePath() { return Path.of("").toAbsolutePath().toString(); } + /** + * Get a temporary file path in the same folder and with similar name as the given file path. + * @param filePath the file path to base the temp file path on. + * @return the temporary file path. + */ public static Path getTempFilePath(Path filePath) { var baseName = filePath.getFileName().toString(); var dotIndex = baseName.indexOf('.'); diff --git a/model/src/test/java/org/gorpipe/gor/driver/providers/stream/datatypes/cram/reference/UTestCompositeReferenceSource.java b/model/src/test/java/org/gorpipe/gor/driver/providers/stream/datatypes/cram/reference/UTestCompositeReferenceSource.java new file mode 100644 index 000000000..eb7185a8d --- /dev/null +++ b/model/src/test/java/org/gorpipe/gor/driver/providers/stream/datatypes/cram/reference/UTestCompositeReferenceSource.java @@ -0,0 +1,181 @@ +/* + * BEGIN_COPYRIGHT + * + * Copyright (C) 2011-2013 deCODE genetics Inc. + * Copyright (C) 2013-2019 WuXi NextCode Inc. + * All Rights Reserved. + * + * GORpipe is free software: you can redistribute it and/or modify + * it under the terms of the AFFERO GNU General Public License as published by + * the Free Software Foundation. + * + * GORpipe is distributed "AS-IS" AND WITHOUT ANY WARRANTY OF ANY KIND, + * INCLUDING ANY IMPLIED WARRANTY OF MERCHANTABILITY, + * NON-INFRINGEMENT, OR FITNESS FOR A PARTICULAR PURPOSE. See + * the AFFERO GNU General Public License for the complete license terms. + * + * You should have received a copy of the AFFERO GNU General Public License + * along with GORpipe. If not, see + * + * END_COPYRIGHT + */ + +package org.gorpipe.gor.driver.providers.stream.datatypes.cram.reference; + +import htsjdk.samtools.SAMSequenceRecord; +import htsjdk.samtools.cram.ref.CRAMReferenceSource; +import org.junit.Before; +import org.junit.Test; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; + +import static org.junit.Assert.*; +import static org.mockito.Mockito.*; + +public class UTestCompositeReferenceSource { + + private CRAMReferenceSource mockSource1; + private CRAMReferenceSource mockSource2; + private CRAMReferenceSource mockSource3; + private SAMSequenceRecord testRecord; + private byte[] testBases; + + @Before + public void setUp() { + mockSource1 = mock(CRAMReferenceSource.class); + mockSource2 = mock(CRAMReferenceSource.class); + mockSource3 = mock(CRAMReferenceSource.class); + testRecord = new SAMSequenceRecord("chr1", 1000); + testBases = "ACGTACGT".getBytes(); + } + + @Test + public void getReferenceBasesReturnsResultFromSingleSource() { + when(mockSource1.getReferenceBases(testRecord, false)).thenReturn(testBases); + + CompositeReferenceSource composite = new CompositeReferenceSource(Collections.singletonList(mockSource1)); + + byte[] result = composite.getReferenceBases(testRecord, false); + + assertArrayEquals(testBases, result); + verify(mockSource1, times(1)).getReferenceBases(testRecord, false); + } + + @Test + public void getReferenceBasesReturnsResultFromFirstSourceWhenMultipleSources() { + when(mockSource1.getReferenceBases(testRecord, false)).thenReturn(testBases); + + CompositeReferenceSource composite = new CompositeReferenceSource(Arrays.asList(mockSource1, mockSource2)); + + byte[] result = composite.getReferenceBases(testRecord, false); + + assertArrayEquals(testBases, result); + verify(mockSource1, times(1)).getReferenceBases(testRecord, false); + verify(mockSource2, never()).getReferenceBases(testRecord, false); + } + + @Test + public void getReferenceBasesFallsBackToSecondSourceWhenFirstFails() { + when(mockSource1.getReferenceBases(testRecord, false)).thenReturn(null); + when(mockSource2.getReferenceBases(testRecord, false)).thenReturn(testBases); + + CompositeReferenceSource composite = new CompositeReferenceSource(Arrays.asList(mockSource1, mockSource2)); + + byte[] result = composite.getReferenceBases(testRecord, false); + + assertArrayEquals(testBases, result); + verify(mockSource1, times(1)).getReferenceBases(testRecord, false); + verify(mockSource2, times(1)).getReferenceBases(testRecord, false); + } + + @Test + public void getReferenceBasesThrowsExceptionWhenAllSourcesFail() { + when(mockSource1.getReferenceBases(testRecord, false)).thenReturn(null); + when(mockSource2.getReferenceBases(testRecord, false)).thenReturn(null); + when(mockSource3.getReferenceBases(testRecord, false)).thenReturn(null); + + CompositeReferenceSource composite = new CompositeReferenceSource(Arrays.asList(mockSource1, mockSource2, mockSource3)); + + assertEquals(null, composite.getReferenceBases(testRecord, false)); + } + + @Test + public void getReferenceBasesPassesTryNameVariantsParameter() { + when(mockSource1.getReferenceBases(testRecord, true)).thenReturn(testBases); + + CompositeReferenceSource composite = new CompositeReferenceSource(Collections.singletonList(mockSource1)); + + byte[] result = composite.getReferenceBases(testRecord, true); + + assertArrayEquals(testBases, result); + verify(mockSource1, times(1)).getReferenceBases(testRecord, true); + } + + + @Test + public void getReferenceBasesThrowsExceptionWhenSourcesListIsEmpty() { + CompositeReferenceSource composite = new CompositeReferenceSource(new ArrayList<>()); + assertEquals(null, composite.getReferenceBases(testRecord, false)); + } + + @Test + public void getReferenceBasesHandlesDifferentSequences() { + SAMSequenceRecord record1 = new SAMSequenceRecord("chr1", 1000); + SAMSequenceRecord record2 = new SAMSequenceRecord("chr2", 2000); + byte[] bases1 = "ACGT".getBytes(); + byte[] bases2 = "TGCA".getBytes(); + + when(mockSource1.getReferenceBases(record1, false)).thenReturn(bases1); + when(mockSource1.getReferenceBases(record2, false)).thenReturn(bases2); + + CompositeReferenceSource composite = new CompositeReferenceSource(Collections.singletonList(mockSource1)); + + assertArrayEquals(bases1, composite.getReferenceBases(record1, false)); + assertArrayEquals(bases2, composite.getReferenceBases(record2, false)); + } + + @Test + public void getReferenceBasesReturnsFirstSuccessfulResult() { + byte[] bases1 = "FIRST".getBytes(); + byte[] bases2 = "SECOND".getBytes(); + + when(mockSource1.getReferenceBases(testRecord, false)).thenReturn(bases1); + when(mockSource2.getReferenceBases(testRecord, false)).thenReturn(bases2); + + CompositeReferenceSource composite = new CompositeReferenceSource(Arrays.asList(mockSource1, mockSource2)); + + byte[] result = composite.getReferenceBases(testRecord, false); + + assertArrayEquals(bases1, result); + verify(mockSource1, times(1)).getReferenceBases(testRecord, false); + verify(mockSource2, never()).getReferenceBases(testRecord, false); + } + + @Test + public void getReferenceBasesCanBeCalledMultipleTimes() { + when(mockSource1.getReferenceBases(testRecord, false)).thenReturn(testBases); + + CompositeReferenceSource composite = new CompositeReferenceSource(Collections.singletonList(mockSource1)); + + assertArrayEquals(testBases, composite.getReferenceBases(testRecord, false)); + assertArrayEquals(testBases, composite.getReferenceBases(testRecord, false)); + verify(mockSource1, times(2)).getReferenceBases(testRecord, false); + } + + @Test + public void getReferenceBasesHandlesLargeSequences() { + byte[] largeBases = new byte[10000]; + Arrays.fill(largeBases, (byte) 'A'); + + when(mockSource1.getReferenceBases(testRecord, false)).thenReturn(largeBases); + + CompositeReferenceSource composite = new CompositeReferenceSource(Collections.singletonList(mockSource1)); + + byte[] result = composite.getReferenceBases(testRecord, false); + + assertEquals(largeBases.length, result.length); + assertArrayEquals(largeBases, result); + } +} diff --git a/model/src/test/java/org/gorpipe/gor/driver/providers/stream/datatypes/cram/reference/UTestEBIReferenceSource.java b/model/src/test/java/org/gorpipe/gor/driver/providers/stream/datatypes/cram/reference/UTestEBIReferenceSource.java new file mode 100644 index 000000000..f6e1ccc43 --- /dev/null +++ b/model/src/test/java/org/gorpipe/gor/driver/providers/stream/datatypes/cram/reference/UTestEBIReferenceSource.java @@ -0,0 +1,245 @@ +package org.gorpipe.gor.driver.providers.stream.datatypes.cram.reference; + +import htsjdk.samtools.SAMSequenceRecord; +import org.gorpipe.exceptions.GorDataException; +import org.gorpipe.exceptions.GorResourceException; +import org.junit.*; +import org.junit.rules.TemporaryFolder; + +import java.io.File; +import java.io.FileWriter; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; + +/** + * Tests for FolderReferenceSource class. + * Tests folder scanning, MD5 mapping, and reference base retrieval. + */ +public class UTestEBIReferenceSource { + + @Rule + public TemporaryFolder workDir = new TemporaryFolder(); + + private File testFolder; + private EBIReferenceSource referenceSource; + + @Before + public void setUp() throws IOException { + testFolder = workDir.newFolder("ref_folder"); + System.setProperty(EBIReferenceSource.KEY_USE_CRAM_REF_DOWNLOAD, "true"); + } + + @After + public void tearDown() throws IOException { + if (referenceSource != null) { + referenceSource.close(); + } + } + + @Test + public void testConstructorWithEmptyFolder() throws IOException { + // Test with empty folder + referenceSource = new EBIReferenceSource(testFolder.getAbsolutePath()); + + // Should not throw exception, but map should be empty + Assert.assertNotNull(referenceSource); + Assert.assertEquals(0, referenceSource.getRefbasesFiles().size()); + } + + @Test(expected = GorResourceException.class) + public void testConstructorWithNonExistentFolder() { + // Test with non-existent folder + referenceSource = new EBIReferenceSource("/non/existent/path"); + } + + @Test + public void testConcurrentAccess() throws IOException, InterruptedException { + createFastaFileWithIndexes("test.fasta", "chr1", "ACGT", "md5_hash"); + + referenceSource = new EBIReferenceSource(testFolder.getAbsolutePath()); + + SAMSequenceRecord record = new SAMSequenceRecord("chr1", 4); + record.setAttribute("M5", "md5_hash"); + + // Test concurrent access + Thread[] threads = new Thread[10]; + for (int i = 0; i < threads.length; i++) { + threads[i] = new Thread(() -> { + byte[] bases = referenceSource.getReferenceBases(record, false); + Assert.assertNotNull(bases); + }); + threads[i].start(); + } + + for (Thread thread : threads) { + thread.join(); + } + } + + @Test + public void testLoadFromRefbasesFile() throws Exception { + String md5 = "refbases_md5"; + byte[] seq = "ACGTREF".getBytes(); + this.createRefbasesFile(md5, seq); + + referenceSource = new EBIReferenceSource(testFolder.getAbsolutePath()); + + SAMSequenceRecord record = new SAMSequenceRecord("chr1", seq.length).setMd5(md5); + + byte[] bases = referenceSource.getReferenceBases(record, false); + + Assert.assertNotNull(bases); + Assert.assertArrayEquals(seq, bases); + Assert.assertEquals(1, referenceSource.getRefbasesFiles().size()); + } + + @Test + public void testDownloadFromEBIReal() throws Exception { + String md5 = "d2ed829b8a1628d16cbeee88e88e39eb"; // hg19 chrM + + referenceSource = new EBIReferenceSource(testFolder.getAbsolutePath()); + + SAMSequenceRecord record = new SAMSequenceRecord("chrM").setMd5(md5); + + byte[] bases = referenceSource.getReferenceBases(record, false); + + Assert.assertNotNull(bases); + Assert.assertEquals(16571, bases.length); + + // The download should have persisted a refbases file + Assert.assertEquals(1, referenceSource.getRefbasesFiles().size()); + Path stored = referenceSource.getRefbasesFiles().iterator().next(); + Assert.assertEquals("md5_" + md5 + ".txt", stored.getFileName().toString()); + Assert.assertTrue(Files.exists(stored)); + Assert.assertArrayEquals(bases, Files.readAllBytes(stored)); + } + + /* + @Test + public void testDownloadFromEBIFake() throws Exception { + String md5 = "download_md5"; + // Start simple HTTP server that returns "SEQUENCE_" for path / + com.sun.net.httpserver.HttpServer server = com.sun.net.httpserver.HttpServer.create(new java.net.InetSocketAddress(0), 0); + server.createContext("/", exchange -> { + try { + String path = exchange.getRequestURI().getPath(); + String req = path.startsWith("/") && path.length() > 1 ? path.substring(1) : ""; + byte[] resp = ("SEQUENCE_" + req).getBytes(java.nio.charset.StandardCharsets.US_ASCII); + exchange.sendResponseHeaders(200, resp.length); + try (var os = exchange.getResponseBody()) { + os.write(resp); + } + } finally { + exchange.close(); + } + }); + server.start(); + + // Save and override Defaults for the test + String oldMask = htsjdk.samtools.Defaults.EBI_REFERENCE_SERVICE_URL_MASK; + boolean oldUse = htsjdk.samtools.Defaults.USE_CRAM_REF_DOWNLOAD; + try { + int port = server.getAddress().getPort(); + htsjdk.samtools.Defaults.EBI_REFERENCE_SERVICE_URL_MASK = "http://localhost:" + port + "/%s"; + htsjdk.samtools.Defaults.USE_CRAM_REF_DOWNLOAD = true; + + referenceSource = new FolderReferenceSource(testFolder.getAbsolutePath()); + + SAMSequenceRecord record = new SAMSequenceRecord("chr1", 100).setMd5(md5); + + byte[] bases = referenceSource.getReferenceBases(record, false); + + Assert.assertNotNull(bases); + Assert.assertEquals("SEQUENCE_" + md5, new String(bases, java.nio.charset.StandardCharsets.US_ASCII)); + + // The download should have persisted a refbases file + Assert.assertEquals(1, referenceSource.getRefbasesFiles().size()); + Path stored = referenceSource.getRefbasesFiles().iterator().next(); + Assert.assertTrue(Files.exists(stored)); + Assert.assertArrayEquals(bases, Files.readAllBytes(stored)); + } finally { + // restore defaults and stop server + htsjdk.samtools.Defaults.EBI_REFERENCE_SERVICE_URL_MASK = oldMask; + htsjdk.samtools.Defaults.USE_CRAM_REF_DOWNLOAD = oldUse; + server.stop(0); + } + } + + */ + + /** + * Helper method to create a FASTA file with corresponding .dict and .fai files. + * Note: This is a simplified version. In a real implementation, you would need + * to create proper FASTA index files using samtools or similar tools. + */ + private File createFastaFileWithIndexes(String fileName, String sequenceName, + String sequence, String md5) throws IOException { + // Create FASTA file + File fastaFile = new File(testFolder, fileName); + var header = ""; + try (FileWriter writer = new FileWriter(fastaFile)) { + header = ">" + sequenceName; + if (md5 != null && !md5.isEmpty()) { + header += " MD5:" + md5; + } + header += "\n"; + writer.write(header); + + // Write sequence in 60 char lines (FASTA convention) + int lineLength = 60; + for (int i = 0; i < sequence.length(); i += lineLength) { + int end = Math.min(i + lineLength, sequence.length()); + writer.write(sequence.substring(i, end)); + writer.write("\n"); + } + } + + // Create .dict file (simplified - real dict files have specific format) + String baseName = fileName.substring(0, fileName.lastIndexOf('.')); + File dictFile = new File(testFolder, baseName + ".dict"); + try (FileWriter writer = new FileWriter(dictFile)) { + writer.write("@HD\tVN:1.0\n"); + writer.write("@SQ\tSN:" + sequenceName + "\tLN:" + sequence.length()); + if (md5 != null && !md5.isEmpty()) { + writer.write("\tM5:" + md5); + } + writer.write("\n"); + } + + // Create .fai file (FASTA index - simplified) + File faiFile = new File(testFolder, fileName + ".fai"); + try (FileWriter writer = new FileWriter(faiFile)) { + // Format: sequence_name, length, offset, linebases, linewidth + // This is simplified - real .fai files need proper calculation + writer.write(sequenceName + "\t" + sequence.length() + "\t" + header.length() + "\t60\t61\n"); + } + + return fastaFile; + } + + private File createRefbasesFile(String md5, byte[] bytes) throws IOException { + Path refbases = testFolder.toPath().resolve("md5_" + md5 + ".txt"); + Files.write(refbases, bytes); + return refbases.toFile(); + } + + /** + * Helper method to calculate MD5 hash of a string. + * This is a simplified version - in reality, MD5 should be calculated + * from the actual sequence bytes in the same way htsjdk does it. + */ + private String calculateMD5(String sequence) { + try { + java.security.MessageDigest md = java.security.MessageDigest.getInstance("MD5"); + byte[] digest = md.digest(sequence.getBytes()); + StringBuilder sb = new StringBuilder(); + for (byte b : digest) { + sb.append(String.format("%02x", b)); + } + return sb.toString(); + } catch (Exception e) { + return "test_md5_hash"; + } + } +} diff --git a/model/src/test/java/org/gorpipe/gor/driver/providers/stream/datatypes/cram/reference/UTestFolderReferenceSource.java b/model/src/test/java/org/gorpipe/gor/driver/providers/stream/datatypes/cram/reference/UTestFolderReferenceSource.java new file mode 100644 index 000000000..d48a24675 --- /dev/null +++ b/model/src/test/java/org/gorpipe/gor/driver/providers/stream/datatypes/cram/reference/UTestFolderReferenceSource.java @@ -0,0 +1,351 @@ +package org.gorpipe.gor.driver.providers.stream.datatypes.cram.reference; + +import htsjdk.samtools.SAMSequenceRecord; +import org.gorpipe.exceptions.GorDataException; +import org.gorpipe.exceptions.GorResourceException; +import org.junit.After; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; + +import java.io.File; +import java.io.FileWriter; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; + +/** + * Tests for FolderReferenceSource class. + * Tests folder scanning, MD5 mapping, and reference base retrieval. + */ +public class UTestFolderReferenceSource { + + @Rule + public TemporaryFolder workDir = new TemporaryFolder(); + + private File testFolder; + private FolderReferenceSource referenceSource; + + @Before + public void setUp() throws IOException { + testFolder = workDir.newFolder("ref_folder"); + } + + @After + public void tearDown() throws IOException { + if (referenceSource != null) { + referenceSource.close(); + } + } + + @Test + public void testConstructorWithEmptyFolder() throws IOException { + // Test with empty folder + referenceSource = new FolderReferenceSource(testFolder.getAbsolutePath()); + + // Should not throw exception, but map should be empty + Assert.assertNotNull(referenceSource); + Assert.assertEquals(0, referenceSource.getReferenceFiles().size()); + } + + @Test(expected = GorResourceException.class) + public void testConstructorWithNonExistentFolder() { + // Test with non-existent folder + referenceSource = new FolderReferenceSource("/non/existent/path"); + } + + @Test + public void testScanFolderWithValidFastaFiles() throws IOException { + // Create a valid FASTA file with index files + File fastaFile = createFastaFileWithIndexes("test_ref.fasta", "chr1", "ACGTACGT", "a1b2c3d4e5f6g7h8i9j0k1l2m3n4o5p6"); + + referenceSource = new FolderReferenceSource(testFolder.getAbsolutePath()); + + // Verify that the FASTA file was scanned + Assert.assertNotNull(referenceSource); + Assert.assertEquals(1, referenceSource.getReferenceFiles().size()); + } + + @Test(expected = GorResourceException.class) + public void testScanFolderIgnoresFastaWithoutIndexFiles() throws IOException { + // Create FASTA file without index files + File fastaFile = new File(testFolder, "no_index.fasta"); + try (FileWriter writer = new FileWriter(fastaFile)) { + writer.write(">chr1\nACGTACGT\n"); + } + + referenceSource = new FolderReferenceSource(testFolder.getAbsolutePath()); + } + + @Test + public void testScanFolderWithMultipleFastaFiles() throws IOException { + // Create multiple FASTA files + createFastaFileWithIndexes("ref1.fasta", "chr1", "ACGT", "md5_1"); + createFastaFileWithIndexes("ref2.fasta", "chr2", "TGCA", "md5_2"); + createFastaFileWithIndexes("ref3.fa", "chr3", "GCTA", "md5_3"); + + referenceSource = new FolderReferenceSource(testFolder.getAbsolutePath()); + + Assert.assertNotNull(referenceSource); + Assert.assertEquals(3, referenceSource.getReferenceFiles().size()); + Assert.assertArrayEquals("TGCA".getBytes(), referenceSource.getReferenceBases( + new SAMSequenceRecord("chr2").setMd5("md5_2"), false)); + + } + + @Test + public void testGetReferenceBasesWithValidMD5() throws IOException { + // Create FASTA file and get its MD5 + String sequenceName = "chr1"; + String sequence = "ACGTACGTACGTACGT"; + String md5 = calculateMD5(sequence); + + File fastaFile = createFastaFileWithIndexes("test.fasta", sequenceName, sequence, md5); + + referenceSource = new FolderReferenceSource(testFolder.getAbsolutePath()); + + // Create a SAMSequenceRecord with the MD5 + SAMSequenceRecord record = new SAMSequenceRecord(sequenceName, sequence.length()).setMd5(md5); + + byte[] bases = referenceSource.getReferenceBases(record, false); + + Assert.assertNotNull(bases); + Assert.assertEquals(sequence.length(), bases.length); + Assert.assertEquals("ACGTACGTACGTACGT", new String(bases)); + } + + @Test + public void testGetReferenceBasesWithInvalidMD5() throws IOException { + // Create FASTA file + createFastaFileWithIndexes("test.fasta", "chr1", "ACGT", "valid_md5"); + + referenceSource = new FolderReferenceSource(testFolder.getAbsolutePath()); + + // Create a SAMSequenceRecord with different MD5 + SAMSequenceRecord record = new SAMSequenceRecord("chr1", 4); + record.setAttribute("M5", "invalid_md5"); + + byte[] bases = referenceSource.getReferenceBases(record, false); + + // Should return empty or fallback to EBI + // The behavior depends on implementation + Assert.assertNull(bases); + } + + @Test(expected = GorDataException.class) + public void testGetReferenceBasesWithNullMD5() throws IOException { + createFastaFileWithIndexes("test.fasta", "chr1", "ACGT", "md5_hash"); + + referenceSource = new FolderReferenceSource(testFolder.getAbsolutePath()); + + SAMSequenceRecord record = new SAMSequenceRecord("chr1", 4); + // No MD5 set + + byte[] bases = referenceSource.getReferenceBases(record, false); + } + + @Test + public void testGetReferenceBasesCaseInsensitive() throws IOException { + String sequence = "acgtacgt"; // lowercase + String md5 = calculateMD5(sequence.toUpperCase()); + + File fastaFile = createFastaFileWithIndexes("test.fasta", "chr1", sequence, md5); + + referenceSource = new FolderReferenceSource(testFolder.getAbsolutePath()); + + SAMSequenceRecord record = new SAMSequenceRecord("chr1", sequence.length()); + record.setAttribute("M5", md5); + + byte[] bases = referenceSource.getReferenceBases(record, false); + + // Bases should be converted to uppercase + Assert.assertNotNull(bases); + String basesString = new String(bases); + Assert.assertTrue(basesString.equals(basesString.toUpperCase())); + } + + @Test + public void testCloseReleasesResources() throws IOException { + createFastaFileWithIndexes("test.fasta", "chr1", "ACGT", "md5_hash"); + + referenceSource = new FolderReferenceSource(testFolder.getAbsolutePath()); + + // Close should not throw exception + referenceSource.close(); + } + + @Test + public void testMultipleCloseCalls() throws IOException { + createFastaFileWithIndexes("test.fasta", "chr1", "ACGT", "md5_hash"); + + referenceSource = new FolderReferenceSource(testFolder.getAbsolutePath()); + + // Multiple close calls should be safe + referenceSource.close(); + referenceSource.close(); + } + + @Test(expected = GorResourceException.class) + public void testScanFolderWithCorruptedFastaFile() throws IOException { + // Create a FASTA file that exists but is corrupted + File fastaFile = new File(testFolder, "corrupted.fasta"); + try (FileWriter writer = new FileWriter(fastaFile)) { + writer.write("This is not a valid FASTA file\n"); + } + + // Create index files so it gets picked up + File dictFile = new File(testFolder, "corrupted.dict"); + dictFile.createNewFile(); + File faiFile = new File(testFolder, "corrupted.fai"); + faiFile.createNewFile(); + + // Should handle corrupted files gracefully + referenceSource = new FolderReferenceSource(testFolder.getAbsolutePath()); + } + + @Test + public void testScanFolderWithSubdirectories() throws IOException { + // Create subdirectory with FASTA file + File subDir = new File(testFolder, "subdir"); + subDir.mkdirs(); + + // Only files in the root folder should be scanned + createFastaFileWithIndexes("root.fasta", "chr1", "ACGT", "md5_1"); + + File subFasta = new File(subDir, "sub.fasta"); + try (FileWriter writer = new FileWriter(subFasta)) { + writer.write(">chr1\nACGT\n"); + } + + referenceSource = new FolderReferenceSource(testFolder.getAbsolutePath()); + + // Only root.fasta should be found + Assert.assertNotNull(referenceSource); + Assert.assertEquals(1, referenceSource.getReferenceFiles().size()); + } + + @Test + public void testGetReferenceBasesWithLongSequence() throws IOException { + // Create a longer sequence + StringBuilder longSeq = new StringBuilder(); + for (int i = 0; i < 1000; i++) { + longSeq.append("ACGT"); + } + + String sequence = longSeq.toString(); + String md5 = calculateMD5(sequence); + + createFastaFileWithIndexes("long.fasta", "chr1", sequence, md5); + + referenceSource = new FolderReferenceSource(testFolder.getAbsolutePath()); + + SAMSequenceRecord record = new SAMSequenceRecord("chr1", sequence.length()); + record.setAttribute("M5", md5); + + byte[] bases = referenceSource.getReferenceBases(record, false); + + Assert.assertNotNull(bases); + Assert.assertEquals(sequence.length(), bases.length); + } + + @Test + public void testConcurrentAccess() throws IOException, InterruptedException { + createFastaFileWithIndexes("test.fasta", "chr1", "ACGT", "md5_hash"); + + referenceSource = new FolderReferenceSource(testFolder.getAbsolutePath()); + + SAMSequenceRecord record = new SAMSequenceRecord("chr1", 4); + record.setAttribute("M5", "md5_hash"); + + // Test concurrent access + Thread[] threads = new Thread[10]; + for (int i = 0; i < threads.length; i++) { + threads[i] = new Thread(() -> { + byte[] bases = referenceSource.getReferenceBases(record, false); + Assert.assertNotNull(bases); + }); + threads[i].start(); + } + + for (Thread thread : threads) { + thread.join(); + } + } + + /** + * Helper method to create a FASTA file with corresponding .dict and .fai files. + * Note: This is a simplified version. In a real implementation, you would need + * to create proper FASTA index files using samtools or similar tools. + */ + private File createFastaFileWithIndexes(String fileName, String sequenceName, + String sequence, String md5) throws IOException { + // Create FASTA file + File fastaFile = new File(testFolder, fileName); + var header = ""; + try (FileWriter writer = new FileWriter(fastaFile)) { + header = ">" + sequenceName; + if (md5 != null && !md5.isEmpty()) { + header += " MD5:" + md5; + } + header += "\n"; + writer.write(header); + + // Write sequence in 60 char lines (FASTA convention) + int lineLength = 60; + for (int i = 0; i < sequence.length(); i += lineLength) { + int end = Math.min(i + lineLength, sequence.length()); + writer.write(sequence.substring(i, end)); + writer.write("\n"); + } + } + + // Create .dict file (simplified - real dict files have specific format) + String baseName = fileName.substring(0, fileName.lastIndexOf('.')); + File dictFile = new File(testFolder, baseName + ".dict"); + try (FileWriter writer = new FileWriter(dictFile)) { + writer.write("@HD\tVN:1.0\n"); + writer.write("@SQ\tSN:" + sequenceName + "\tLN:" + sequence.length()); + if (md5 != null && !md5.isEmpty()) { + writer.write("\tM5:" + md5); + } + writer.write("\n"); + } + + // Create .fai file (FASTA index - simplified) + File faiFile = new File(testFolder, fileName + ".fai"); + try (FileWriter writer = new FileWriter(faiFile)) { + // Format: sequence_name, length, offset, linebases, linewidth + // This is simplified - real .fai files need proper calculation + writer.write(sequenceName + "\t" + sequence.length() + "\t" + header.length() + "\t60\t61\n"); + } + + return fastaFile; + } + + private File createRefbasesFile(String md5, byte[] bytes) throws IOException { + Path refbases = testFolder.toPath().resolve("md5_" + md5 + ".txt"); + Files.write(refbases, bytes); + return refbases.toFile(); + } + + /** + * Helper method to calculate MD5 hash of a string. + * This is a simplified version - in reality, MD5 should be calculated + * from the actual sequence bytes in the same way htsjdk does it. + */ + private String calculateMD5(String sequence) { + try { + java.security.MessageDigest md = java.security.MessageDigest.getInstance("MD5"); + byte[] digest = md.digest(sequence.getBytes()); + StringBuilder sb = new StringBuilder(); + for (byte b : digest) { + sb.append(String.format("%02x", b)); + } + return sb.toString(); + } catch (Exception e) { + return "test_md5_hash"; + } + } +} diff --git a/test/src/main/java/gorsat/TestUtils.java b/test/src/main/java/gorsat/TestUtils.java index 90904035c..dc1dfadec 100644 --- a/test/src/main/java/gorsat/TestUtils.java +++ b/test/src/main/java/gorsat/TestUtils.java @@ -304,6 +304,8 @@ public static PipeInstance createPipeInstance(boolean server, String gorroot, St options.add("-cachedir"); options.add(cacheDir); } +// options.add("-config"); +// options.add("../tests/data/ref_mini/gor_config.txt"); return PipeInstance.createGorIterator(new GorContext(createSession(options.toArray(String[]::new), null, server, securityContext, writeLocations))); } From 3e9f8b1686fe4cba45b4a87f95866c07147a7755 Mon Sep 17 00:00:00 2001 From: Gisli Magnusson Date: Thu, 5 Feb 2026 00:42:01 +0000 Subject: [PATCH 2/7] feat(ENGKNOW-3046): Enable multiple cram reference files. --- .github/workflows/build.yml | 4 +-- .../cram/reference/EBIReferenceSource.java | 24 ++++++++------- .../cram/reference/FolderReferenceSource.java | 2 +- .../reference/MD5CachedReferenceSource.java | 30 +++++++++++++------ 4 files changed, 37 insertions(+), 23 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 6b2aa47d1..315b9af5c 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -142,8 +142,8 @@ jobs: files: '**/TEST-*.xml' publishSnapshot: - if: ${{ github.ref == 'refs/heads/main' }} - needs: [test, slowTest, integrationTest] + #if: ${{ github.ref == 'refs/heads/main' }} + #needs: [test, slowTest, integrationTest] runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 diff --git a/model/src/main/java/org/gorpipe/gor/driver/providers/stream/datatypes/cram/reference/EBIReferenceSource.java b/model/src/main/java/org/gorpipe/gor/driver/providers/stream/datatypes/cram/reference/EBIReferenceSource.java index d12dbfac7..6703bf469 100644 --- a/model/src/main/java/org/gorpipe/gor/driver/providers/stream/datatypes/cram/reference/EBIReferenceSource.java +++ b/model/src/main/java/org/gorpipe/gor/driver/providers/stream/datatypes/cram/reference/EBIReferenceSource.java @@ -81,8 +81,10 @@ private void processRefbasesFile(Path refbases) { @Override protected byte[] loadReference(final SAMSequenceRecord record) { + var md5 = record.getMd5(); + // Load from refbases file. - Path refbasesPath = md5ToRefbases.get(record.getMd5()); + Path refbasesPath = md5ToRefbases.get(md5); if (refbasesPath != null) { try { byte[] bases = Files.readAllBytes(refbasesPath); @@ -96,21 +98,21 @@ protected byte[] loadReference(final SAMSequenceRecord record) { } // Load from EBI service. - if (Boolean.parseBoolean(System.getProperty(KEY_USE_CRAM_REF_DOWNLOAD, - Boolean.toString(Defaults.USE_CRAM_REF_DOWNLOAD)))) { - try { + if (Boolean.parseBoolean(System.getProperty(KEY_USE_CRAM_REF_DOWNLOAD, "True" /*Boolean.toString(Defaults.USE_CRAM_REF_DOWNLOAD)*/))) { - byte[] bases = downloadFromEBI(record.getMd5()); - if (bases != EMPTY_BASES) { - saveRefbasesToDisk(record.getMd5(), bases); + try { + // Just use mem, this is going into mem cache anyway. + byte[] bases = downloadFromEBI(md5); + if (bases != null) { + saveRefbasesToDisk(md5, bases); } return bases; } catch (IOException e) { - log.warn("Could not download/save reference sequence for md5 " + record.getMd5(), e); + log.warn("Could not download/save reference sequence for md5 " + md5, e); } } - return EMPTY_BASES; + return null; } /** @@ -129,14 +131,14 @@ private byte[] downloadFromEBI(String md5) throws IOException { if (conn.getResponseCode() != 200) { log.warn("ENA returned {} for {}", conn.getResponseCode(), md5); - return EMPTY_BASES; + return null; } byte[] bases; try (BufferedInputStream in = new BufferedInputStream(conn.getInputStream())) { bases = in.readAllBytes(); } - if (bases.length == 0) return EMPTY_BASES; + if (bases.length == 0) return null; return bases; } diff --git a/model/src/main/java/org/gorpipe/gor/driver/providers/stream/datatypes/cram/reference/FolderReferenceSource.java b/model/src/main/java/org/gorpipe/gor/driver/providers/stream/datatypes/cram/reference/FolderReferenceSource.java index 127b3ebd3..016b2715a 100644 --- a/model/src/main/java/org/gorpipe/gor/driver/providers/stream/datatypes/cram/reference/FolderReferenceSource.java +++ b/model/src/main/java/org/gorpipe/gor/driver/providers/stream/datatypes/cram/reference/FolderReferenceSource.java @@ -77,7 +77,7 @@ protected byte[] loadReference(final SAMSequenceRecord record) { return rsFile.getSequence(referencePath.contig()).getBases(); } - return EMPTY_BASES; + return null; } private void scanReferenceFolder() { diff --git a/model/src/main/java/org/gorpipe/gor/driver/providers/stream/datatypes/cram/reference/MD5CachedReferenceSource.java b/model/src/main/java/org/gorpipe/gor/driver/providers/stream/datatypes/cram/reference/MD5CachedReferenceSource.java index ccd6c6180..9b1196793 100644 --- a/model/src/main/java/org/gorpipe/gor/driver/providers/stream/datatypes/cram/reference/MD5CachedReferenceSource.java +++ b/model/src/main/java/org/gorpipe/gor/driver/providers/stream/datatypes/cram/reference/MD5CachedReferenceSource.java @@ -15,14 +15,11 @@ import java.io.Closeable; import java.io.IOException; import java.util.Arrays; -import java.util.concurrent.ExecutionException; import java.util.concurrent.TimeUnit; public abstract class MD5CachedReferenceSource implements CRAMReferenceSource, Closeable { private static final Logger log = LoggerFactory.getLogger(MD5CachedReferenceSource.class); - public static final byte[] EMPTY_BASES = new byte[0]; - private static final Cache md5BasesCache = createMd5BasesCache(); private static Cache createMd5BasesCache () { @@ -46,13 +43,28 @@ public synchronized byte[] getReferenceBases(final SAMSequenceRecord record, throw new GorDataException("Can not load reference bases as SAMSequenceRecord does not contain MD5"); } - try { - var bases = md5BasesCache.get(md5, () -> loadReference(record)); + byte[] bases = md5BasesCache.getIfPresent(md5); + + if (bases == null) { + // Syncrhonize on md5 string to avoid multiple threads loading the same reference at the same time. + synchronized (md5) { + // Double check if another thread has loaded the reference while we were waiting for the lock. + bases = md5BasesCache.getIfPresent(md5); + if (bases == null) { + log.debug("Loading reference for md5 {}", md5); + bases = loadReference(record); + if (bases != null) { + md5BasesCache.put(md5, bases); + } + } + } + } + + if (bases != null) { StringUtil.toUpperCase(bases); - return bases != EMPTY_BASES ? bases : null; - }catch (ExecutionException e) { - throw new GorDataException("Failed to load CRAM reference: " + md5, e); } + return bases; + } @Override @@ -77,7 +89,7 @@ public byte[] getReferenceBasesByRegion( /** * Load reference by MD5 from disk, downloading from EBI ENA if needed. * @param record SAM record with the sequence detail (must include MD5). - * @return the bases, or EMPTY_BASES if no bases where found on disk and on EBI. + * @return the bases, or null if no bases where found on disk and on EBI. */ protected abstract byte[] loadReference(SAMSequenceRecord record); From f3c9a3a5439747c8556840b3654afe904590c77e Mon Sep 17 00:00:00 2001 From: Gisli Magnusson Date: Thu, 5 Feb 2026 02:32:26 +0000 Subject: [PATCH 3/7] feat(ENGKNOW-3046): Enable multiple cram reference files. --- .../driver/providers/stream/datatypes/cram/CramIterator.java | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/model/src/main/java/org/gorpipe/gor/driver/providers/stream/datatypes/cram/CramIterator.java b/model/src/main/java/org/gorpipe/gor/driver/providers/stream/datatypes/cram/CramIterator.java index fe31c7a7f..9adb85dbb 100644 --- a/model/src/main/java/org/gorpipe/gor/driver/providers/stream/datatypes/cram/CramIterator.java +++ b/model/src/main/java/org/gorpipe/gor/driver/providers/stream/datatypes/cram/CramIterator.java @@ -70,7 +70,7 @@ public class CramIterator extends BamIterator { public final static String KEY_GENERATEMISSINGATTRIBUTES = "gor.driver.cram.generatemissingattributes"; public final static String KEY_FASTAREFERENCESOURCE = "gor.driver.cram.fastareferencesource"; - public final static String KEY_REFERENCE_FORCE_FOLDER = "gor.driver.cram.reference.force.folder."; + public final static String KEY_REFERENCE_FORCE_FOLDER = "gor.driver.cram.reference.force.folder"; private static final Logger log = LoggerFactory.getLogger(CramIterator.class); @@ -265,14 +265,17 @@ private File getReferenceFromReferenceLinkFile(File file) { private CRAMReferenceSource createFileReference(File refFile) { if (refFile.isDirectory()) { + log.info("Using folder reference for CRAM: {}", refFile.getPath()); return new CompositeReferenceSource(List.of( new FolderReferenceSource(refFile.getPath()), new EBIReferenceSource(refFile.getPath()))); } else if (Boolean.getBoolean(System.getProperty(KEY_REFERENCE_FORCE_FOLDER, "true"))) { + log.info("Using folder reference for CRAM: {}", refFile.getParent()); return new CompositeReferenceSource(List.of( new FolderReferenceSource(refFile.getParent()), new EBIReferenceSource(refFile.getParent()))); } else { + log.info("Using fasta reference file for CRAM: {}", refFile.getPath()); referenceSequenceFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(refFile); String referenceKey = FilenameUtils.removeExtension(refFile.getName()); From e4f8c63c04baf3c2880d7f4fe7451631501cb243 Mon Sep 17 00:00:00 2001 From: Gisli Magnusson Date: Thu, 5 Feb 2026 03:11:01 +0000 Subject: [PATCH 4/7] feat(ENGKNOW-3046): Enable multiple cram reference files. --- .../driver/providers/stream/datatypes/cram/CramIterator.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/model/src/main/java/org/gorpipe/gor/driver/providers/stream/datatypes/cram/CramIterator.java b/model/src/main/java/org/gorpipe/gor/driver/providers/stream/datatypes/cram/CramIterator.java index 9adb85dbb..94427ee7b 100644 --- a/model/src/main/java/org/gorpipe/gor/driver/providers/stream/datatypes/cram/CramIterator.java +++ b/model/src/main/java/org/gorpipe/gor/driver/providers/stream/datatypes/cram/CramIterator.java @@ -269,7 +269,7 @@ private CRAMReferenceSource createFileReference(File refFile) { return new CompositeReferenceSource(List.of( new FolderReferenceSource(refFile.getPath()), new EBIReferenceSource(refFile.getPath()))); - } else if (Boolean.getBoolean(System.getProperty(KEY_REFERENCE_FORCE_FOLDER, "true"))) { + } else if (Boolean.parseBoolean(System.getProperty(KEY_REFERENCE_FORCE_FOLDER, "true"))) { log.info("Using folder reference for CRAM: {}", refFile.getParent()); return new CompositeReferenceSource(List.of( new FolderReferenceSource(refFile.getParent()), From 566f987aa6258683e40fb9791c49c97d3797db9e Mon Sep 17 00:00:00 2001 From: Gisli Magnusson Date: Thu, 5 Feb 2026 12:22:57 +0000 Subject: [PATCH 5/7] feat(ENGKNOW-3046): Enable multiple cram reference files. --- .github/workflows/build.yml | 4 +- .../stream/datatypes/cram/CramIterator.java | 6 +- .../cram/reference/EBIReferenceSource.java | 60 +++++++++++-------- .../reference/MD5CachedReferenceSource.java | 6 +- 4 files changed, 42 insertions(+), 34 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 315b9af5c..6b2aa47d1 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -142,8 +142,8 @@ jobs: files: '**/TEST-*.xml' publishSnapshot: - #if: ${{ github.ref == 'refs/heads/main' }} - #needs: [test, slowTest, integrationTest] + if: ${{ github.ref == 'refs/heads/main' }} + needs: [test, slowTest, integrationTest] runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 diff --git a/model/src/main/java/org/gorpipe/gor/driver/providers/stream/datatypes/cram/CramIterator.java b/model/src/main/java/org/gorpipe/gor/driver/providers/stream/datatypes/cram/CramIterator.java index 94427ee7b..711cb664d 100644 --- a/model/src/main/java/org/gorpipe/gor/driver/providers/stream/datatypes/cram/CramIterator.java +++ b/model/src/main/java/org/gorpipe/gor/driver/providers/stream/datatypes/cram/CramIterator.java @@ -265,17 +265,17 @@ private File getReferenceFromReferenceLinkFile(File file) { private CRAMReferenceSource createFileReference(File refFile) { if (refFile.isDirectory()) { - log.info("Using folder reference for CRAM: {}", refFile.getPath()); + log.debug("Using folder reference for CRAM: {}", refFile.getPath()); return new CompositeReferenceSource(List.of( new FolderReferenceSource(refFile.getPath()), new EBIReferenceSource(refFile.getPath()))); } else if (Boolean.parseBoolean(System.getProperty(KEY_REFERENCE_FORCE_FOLDER, "true"))) { - log.info("Using folder reference for CRAM: {}", refFile.getParent()); + log.debug("Using folder reference for CRAM: {}", refFile.getParent()); return new CompositeReferenceSource(List.of( new FolderReferenceSource(refFile.getParent()), new EBIReferenceSource(refFile.getParent()))); } else { - log.info("Using fasta reference file for CRAM: {}", refFile.getPath()); + log.debug("Using fasta reference file for CRAM: {}", refFile.getPath()); referenceSequenceFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(refFile); String referenceKey = FilenameUtils.removeExtension(refFile.getName()); diff --git a/model/src/main/java/org/gorpipe/gor/driver/providers/stream/datatypes/cram/reference/EBIReferenceSource.java b/model/src/main/java/org/gorpipe/gor/driver/providers/stream/datatypes/cram/reference/EBIReferenceSource.java index 6703bf469..85add05a9 100644 --- a/model/src/main/java/org/gorpipe/gor/driver/providers/stream/datatypes/cram/reference/EBIReferenceSource.java +++ b/model/src/main/java/org/gorpipe/gor/driver/providers/stream/datatypes/cram/reference/EBIReferenceSource.java @@ -2,6 +2,8 @@ import htsjdk.samtools.Defaults; import htsjdk.samtools.SAMSequenceRecord; +import htsjdk.samtools.cram.io.InputStreamUtils; +import htsjdk.samtools.util.SequenceUtil; import org.gorpipe.exceptions.GorDataException; import org.gorpipe.exceptions.GorResourceException; import org.gorpipe.gor.table.util.PathUtils; @@ -9,13 +11,13 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.io.BufferedInputStream; import java.io.IOException; -import java.net.HttpURLConnection; +import java.io.InputStream; import java.net.URL; import java.nio.file.Files; import java.nio.file.Path; import java.util.HashSet; +import java.util.Locale; import java.util.Map; import java.util.Set; import java.util.concurrent.ConcurrentHashMap; @@ -34,6 +36,8 @@ public class EBIReferenceSource extends MD5CachedReferenceSource { protected static Map md5ToRefbases = new ConcurrentHashMap<>(); + private static final int DOWNLOAD_TRIES_BEFORE_FAILING = 2; + private Path referenceFolder; // If null we do not download. public EBIReferenceSource() { @@ -98,8 +102,7 @@ protected byte[] loadReference(final SAMSequenceRecord record) { } // Load from EBI service. - if (Boolean.parseBoolean(System.getProperty(KEY_USE_CRAM_REF_DOWNLOAD, "True" /*Boolean.toString(Defaults.USE_CRAM_REF_DOWNLOAD)*/))) { - + if (Boolean.parseBoolean(System.getProperty(KEY_USE_CRAM_REF_DOWNLOAD, "True"))) { try { // Just use mem, this is going into mem cache anyway. byte[] bases = downloadFromEBI(md5); @@ -107,7 +110,7 @@ protected byte[] loadReference(final SAMSequenceRecord record) { saveRefbasesToDisk(md5, bases); } return bases; - } catch (IOException e) { + } catch (Exception e) { log.warn("Could not download/save reference sequence for md5 " + md5, e); } } @@ -119,28 +122,35 @@ protected byte[] loadReference(final SAMSequenceRecord record) { * Download reference sequence from EBI by MD5 and store it in the reference folder. * @param md5 * @return bytes of the reference sequence, null if not found. - * @throws IOException + * @throws IOException if the sequence is not found or the download fails. */ - private byte[] downloadFromEBI(String md5) throws IOException { - log.info("Downloading reference {} from ENA", md5); - URL url = new URL(String.format(Defaults.EBI_REFERENCE_SERVICE_URL_MASK, md5)); - HttpURLConnection conn = (HttpURLConnection) url.openConnection(); - conn.setConnectTimeout(15000); - conn.setReadTimeout(30000); - conn.setRequestMethod("GET"); - - if (conn.getResponseCode() != 200) { - log.warn("ENA returned {} for {}", conn.getResponseCode(), md5); - return null; - } - - byte[] bases; - try (BufferedInputStream in = new BufferedInputStream(conn.getInputStream())) { - bases = in.readAllBytes(); + private byte[] downloadFromEBI(final String md5) throws IOException { + final String url = String.format(Locale.US, Defaults.EBI_REFERENCE_SERVICE_URL_MASK, md5); + + for (int i = 0; i < DOWNLOAD_TRIES_BEFORE_FAILING; i++) { + try (final InputStream is = new URL(url).openStream()) { + if (is == null) + return null; + + log.info("Downloading reference sequence: {}", url); + final byte[] bases = InputStreamUtils.readFully(is); + log.info("Downloaded {} bytes for md5 {}", bases.length, md5); + + final String downloadedMD5 = SequenceUtil.calculateMD5String(bases); + if (md5.equals(downloadedMD5)) { + return bases; + } else { + log.error("Downloaded sequence is corrupt: requested md5={}, received md5={}", + md5, downloadedMD5); + } + return bases; + } + catch (final IOException e) { + log.warn("Failed to download reference sequence for md5 {} on try {}/{}", + md5, (i + 1), DOWNLOAD_TRIES_BEFORE_FAILING, e); + } } - if (bases.length == 0) return null; - - return bases; + throw new IOException("Giving up on downloading sequence for md5 %s".formatted(md5)); } private void saveRefbasesToDisk(String md5, byte[] bases) throws IOException { diff --git a/model/src/main/java/org/gorpipe/gor/driver/providers/stream/datatypes/cram/reference/MD5CachedReferenceSource.java b/model/src/main/java/org/gorpipe/gor/driver/providers/stream/datatypes/cram/reference/MD5CachedReferenceSource.java index 9b1196793..fc8c69a64 100644 --- a/model/src/main/java/org/gorpipe/gor/driver/providers/stream/datatypes/cram/reference/MD5CachedReferenceSource.java +++ b/model/src/main/java/org/gorpipe/gor/driver/providers/stream/datatypes/cram/reference/MD5CachedReferenceSource.java @@ -54,17 +54,15 @@ public synchronized byte[] getReferenceBases(final SAMSequenceRecord record, log.debug("Loading reference for md5 {}", md5); bases = loadReference(record); if (bases != null) { + // Normalize to upper case (that is what HTSJDK impl does). + StringUtil.toUpperCase(bases); md5BasesCache.put(md5, bases); } } } } - if (bases != null) { - StringUtil.toUpperCase(bases); - } return bases; - } @Override From a4aa4b0487a24ac08a13babc4f886ec131fdc7a4 Mon Sep 17 00:00:00 2001 From: Gisli Magnusson Date: Thu, 5 Feb 2026 12:47:25 +0000 Subject: [PATCH 6/7] feat(ENGKNOW-3046): Bump version --- VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VERSION b/VERSION index 332e88471..d7f89d02a 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -5.8.3 +5.8.4 From 551c12f5f6a39b484dd33e4d9db04d2e4ae75f36 Mon Sep 17 00:00:00 2001 From: Gisli Magnusson Date: Thu, 5 Feb 2026 14:28:01 +0000 Subject: [PATCH 7/7] feat(ENGKNOW-3046): Enable multiple cram reference files. --- gortools/src/test/java/gorsat/UTestCram.java | 7 +- .../stream/datatypes/cram/CramIterator.java | 129 ++++++++++-------- 2 files changed, 76 insertions(+), 60 deletions(-) diff --git a/gortools/src/test/java/gorsat/UTestCram.java b/gortools/src/test/java/gorsat/UTestCram.java index 9b1d20bc9..8b54460e2 100644 --- a/gortools/src/test/java/gorsat/UTestCram.java +++ b/gortools/src/test/java/gorsat/UTestCram.java @@ -38,10 +38,9 @@ import java.io.IOException; import java.nio.charset.Charset; import java.nio.file.Paths; -import java.util.List; import static gorsat.TestUtils.LINE_SPLIT_PATTERN; -import static org.gorpipe.gor.driver.providers.stream.datatypes.cram.CramIterator.KEY_REFERENCE_FORCE_FOLDER; +import static org.gorpipe.gor.driver.providers.stream.datatypes.cram.CramIterator.KEY_REFERENCE_PREFER_FOLDER; public class UTestCram { @@ -121,7 +120,7 @@ public void readCramWithFastaReferenceFromConfigException() throws IOException { try { TestUtils.runGorPipeCount(args); } catch (GorResourceException e) { - Assert.assertTrue(e.getMessage().startsWith("Reference does not exist.")); + Assert.assertTrue(e.getMessage().startsWith("No cram reference found")); Assert.assertTrue(e.getUri().endsWith("cram_query_sorted2.fasta")); } } @@ -129,7 +128,7 @@ public void readCramWithFastaReferenceFromConfigException() throws IOException { @Test public void readCramWithFastaReferenceAndGenerateMissingAttributes() { System.setProperty("gor.driver.cram.fastareferencesource", DataUtil.toFile("../tests/data/external/samtools/cram_query_sorted", DataType.FASTA)); - System.setProperty(KEY_REFERENCE_FORCE_FOLDER, "false"); + System.setProperty(KEY_REFERENCE_PREFER_FOLDER, "false"); String[] args = new String[] {"gor " + DataUtil.toFile("../tests/data/external/samtools/cram_query_sorted", DataType.CRAM)}; diff --git a/model/src/main/java/org/gorpipe/gor/driver/providers/stream/datatypes/cram/CramIterator.java b/model/src/main/java/org/gorpipe/gor/driver/providers/stream/datatypes/cram/CramIterator.java index 711cb664d..be21fd410 100644 --- a/model/src/main/java/org/gorpipe/gor/driver/providers/stream/datatypes/cram/CramIterator.java +++ b/model/src/main/java/org/gorpipe/gor/driver/providers/stream/datatypes/cram/CramIterator.java @@ -70,11 +70,11 @@ public class CramIterator extends BamIterator { public final static String KEY_GENERATEMISSINGATTRIBUTES = "gor.driver.cram.generatemissingattributes"; public final static String KEY_FASTAREFERENCESOURCE = "gor.driver.cram.fastareferencesource"; - public final static String KEY_REFERENCE_FORCE_FOLDER = "gor.driver.cram.reference.force.folder"; + public final static String KEY_REFERENCE_PREFER_FOLDER = "gor.driver.cram.reference.preferfolder"; private static final Logger log = LoggerFactory.getLogger(CramIterator.class); - private CramFile cramFile; + private final CramFile cramFile; private int[] columns; ChromoLookup lookup; private String fileName; @@ -164,7 +164,7 @@ public void init(GorSession session) { fileName = cramFile.getFileSource().getSourceReference().getUrl(); - referenceSource = createReferenceSource(getInitialReferenceFile(), session.getProjectContext().getRealProjectRoot()); + referenceSource = createReferenceSource(session.getProjectContext().getRealProjectRoot()); SeekableBufferedStream cramStream = new SeekableBufferedStream(new StreamSourceSeekableStream(cramFile.getFileSource())); @@ -191,14 +191,9 @@ public void init(GorSession session) { * * @return initial reference file */ - private String getInitialReferenceFile() { + private String getSourceReferenceFile() { StreamSource ref = cramFile.getReferenceSource(); - String referenceFileName = ""; - - if (ref != null) { - referenceFileName = ref.getSourceReference().getUrl(); - } - return referenceFileName; + return ref != null ? ref.getSourceReference().getUrl() : null; } private void closeReferenceFile() { @@ -211,77 +206,99 @@ private void closeReferenceFile() { } } - private CRAMReferenceSource createReferenceSource(String ref, String root) { + private CRAMReferenceSource createReferenceSource(String root) { + File file = null; + boolean forceFolder = false; + + String sourceRef = getSourceReferenceFile(); + if (!Strings.isNullOrEmpty(sourceRef)) { + file = new File(sourceRef); + } + + if (file == null) { + file = getReferenceFromReferenceLinkFile(); + } + + if (file == null) { + file = getReferenceFromGorConfig(root); + forceFolder = Boolean.parseBoolean(System.getProperty(KEY_REFERENCE_PREFER_FOLDER, "true")); + } - File file = new File(ref); - file = getReferenceFromReferenceLinkFile(file); - file = getReferenceFromGorConfig(file, root); - file = getReferenceFromGorOptions(file); + if (file == null) { + file = getReferenceFromGorOptions(); + forceFolder = Boolean.parseBoolean(System.getProperty(KEY_REFERENCE_PREFER_FOLDER, "true")); - if (!file.exists()) { - throw new GorResourceException("Reference does not exist.", file.toString()); + } + if (file == null || !file.exists()) { + throw new GorResourceException("No cram reference found: %s".formatted(file), file != null ? file.getPath() : "null"); } // This reference should be fasta but we let the htsjdk library decide - return createFileReference(file); + return createFileReference(file, forceFolder); } - private File getReferenceFromGorOptions(File file) { - if (!file.exists()) { - String refPath = System.getProperty(KEY_FASTAREFERENCESOURCE, ""); + private File getReferenceFromGorOptions() { + String refPath = System.getProperty(KEY_FASTAREFERENCESOURCE, ""); - if (!StringUtils.isEmpty(refPath)) { - return new File(refPath); - } + if (!StringUtils.isEmpty(refPath)) { + return new File(refPath); } - return file; + + return null; } - private File getReferenceFromGorConfig(File file, String root) { - if (!file.exists() && !Strings.isNullOrEmpty(projectCramReferencePath)) { + private File getReferenceFromGorConfig(String root) { + if (!Strings.isNullOrEmpty(projectCramReferencePath)) { return PathUtils.resolve(Paths.get(root), Paths.get(projectCramReferencePath)).toFile(); } - return file; + return null; } - private File getReferenceFromReferenceLinkFile(File file) { - if (!file.exists()) { - File refLinkFile = new File(this.fileName + ".ref"); + private File getReferenceFromReferenceLinkFile() { + File refLinkFile = new File(this.fileName + ".ref"); - if (refLinkFile.exists()) { - try { - List lines = FileUtils.readLines(refLinkFile, Charset.defaultCharset()); + if (refLinkFile.exists()) { + try { + List lines = FileUtils.readLines(refLinkFile, Charset.defaultCharset()); - if (lines.size() > 0) { - return new File(lines.get(0)); - } - } catch (IOException e) { - /*Do Nothing*/ + if (lines.size() > 0) { + return new File(lines.get(0)); } + } catch (IOException e) { + /*Do Nothing*/ } } - return file; + + return null; } - private CRAMReferenceSource createFileReference(File refFile) { + private CRAMReferenceSource createFileReference(File refFile, boolean preferFolder) { if (refFile.isDirectory()) { - log.debug("Using folder reference for CRAM: {}", refFile.getPath()); - return new CompositeReferenceSource(List.of( - new FolderReferenceSource(refFile.getPath()), - new EBIReferenceSource(refFile.getPath()))); - } else if (Boolean.parseBoolean(System.getProperty(KEY_REFERENCE_FORCE_FOLDER, "true"))) { - log.debug("Using folder reference for CRAM: {}", refFile.getParent()); - return new CompositeReferenceSource(List.of( - new FolderReferenceSource(refFile.getParent()), - new EBIReferenceSource(refFile.getParent()))); + return createCompositeReferenceSource(refFile); + } else if (preferFolder) { + try { + return createCompositeReferenceSource(refFile.getParentFile()); + } catch (Exception e) { + // Fallback to single file, in case none of the files contains proper meta. + return createSharedFastaReferenceSource(refFile); + } } else { - log.debug("Using fasta reference file for CRAM: {}", refFile.getPath()); - referenceSequenceFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(refFile); - - String referenceKey = FilenameUtils.removeExtension(refFile.getName()); - var referenceFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(refFile); - return new SharedFastaReferenceSource(referenceFile, referenceKey); + return createSharedFastaReferenceSource(refFile); } } + private CRAMReferenceSource createSharedFastaReferenceSource(File refFile) { + log.debug("Using fasta reference file for CRAM: {}", refFile.getPath()); + referenceSequenceFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(refFile); + + String referenceKey = FilenameUtils.removeExtension(refFile.getName()); + var referenceFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(refFile); + return new SharedFastaReferenceSource(referenceFile, referenceKey); + } + private CRAMReferenceSource createCompositeReferenceSource(File refFolder) { + log.debug("Using folder reference for CRAM: {}", refFolder.getPath()); + return new CompositeReferenceSource(List.of( + new FolderReferenceSource(refFolder.getPath()), + new EBIReferenceSource(refFolder.getPath()))); + } }