Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 30 additions & 17 deletions gortools/src/test/java/gorsat/UTestCram.java
Original file line number Diff line number Diff line change
Expand Up @@ -31,12 +31,17 @@
import org.junit.Assert;
import org.junit.Rule;
import org.junit.Test;
import org.junit.contrib.java.lang.system.RestoreSystemProperties;
import org.junit.rules.TemporaryFolder;

import java.io.File;
import java.io.IOException;
import java.nio.charset.Charset;
import java.nio.file.Paths;
import java.util.List;

import static gorsat.TestUtils.LINE_SPLIT_PATTERN;
import static org.gorpipe.gor.driver.providers.stream.datatypes.cram.CramIterator.KEY_REFERENCE_FORCE_FOLDER;

public class UTestCram {

Expand All @@ -45,6 +50,9 @@ public class UTestCram {
@Rule
public TemporaryFolder workDir = new TemporaryFolder();

@Rule
public RestoreSystemProperties restoreSystemProperties = new RestoreSystemProperties();

public static File createWrongConfigFile(File directory) throws IOException {
return FileTestUtils.createTempFile(directory, "generic.gor",
"buildPath\t../tests/data/ref_mini/chromSeq\n" +
Expand Down Expand Up @@ -106,7 +114,7 @@ public void readCramWithFastaReferenceFromConfig() {
public void readCramWithFastaReferenceFromConfigException() throws IOException {
File wrongConfigFile = createWrongConfigFile(workDir.getRoot());
System.clearProperty("gor.driver.cram.fastareferencesource");
String[] args = new String[]{
String[] args = new String[] {
"gor " + DataUtil.toFile("../tests/data/external/samtools/cram_query_sorted", DataType.CRAM),
"-config",
wrongConfigFile.getCanonicalPath()};
Expand All @@ -120,24 +128,29 @@ public void readCramWithFastaReferenceFromConfigException() throws IOException {

@Test
public void readCramWithFastaReferenceAndGenerateMissingAttributes() {
try {
System.setProperty("gor.driver.cram.fastareferencesource", DataUtil.toFile("../tests/data/external/samtools/cram_query_sorted", DataType.FASTA));
System.setProperty("gor.driver.cram.generatemissingattributes", "false");
String[] linesWithoutMissingAttributes = TestUtils.runGorPipeLines("gor " + DataUtil.toFile("../tests/data/external/samtools/cram_query_sorted", DataType.CRAM));
System.setProperty("gor.driver.cram.generatemissingattributes", "true");
String[] linesWithMissingAttributes = TestUtils.runGorPipeLines("gor " + DataUtil.toFile("../tests/data/external/samtools/cram_query_sorted", DataType.CRAM));

Assert.assertEquals(8, linesWithoutMissingAttributes.length);
Assert.assertEquals(8, linesWithMissingAttributes.length);
// See if we have the missing entry in the last column.
Assert.assertFalse(linesWithoutMissingAttributes[1].contains("NM="));
Assert.assertTrue(linesWithMissingAttributes[1].contains("NM="));
System.setProperty("gor.driver.cram.fastareferencesource", DataUtil.toFile("../tests/data/external/samtools/cram_query_sorted", DataType.FASTA));
System.setProperty(KEY_REFERENCE_FORCE_FOLDER, "false");

} finally {
System.clearProperty("gor.driver.cram.fastareferencesource");
System.clearProperty("gor.driver.cram.generatemissingattributes");
}
String[] args = new String[] {"gor " + DataUtil.toFile("../tests/data/external/samtools/cram_query_sorted", DataType.CRAM)};

System.setProperty("gor.driver.cram.generatemissingattributes", "false");
String[] linesWithoutMissingAttributes = TestUtils.runGorPipe(args, false).split(LINE_SPLIT_PATTERN);

System.setProperty("gor.driver.cram.generatemissingattributes", "true");
String[] linesWithMissingAttributesCramRef = TestUtils.runGorPipe(args, false).split(LINE_SPLIT_PATTERN);

args = new String[] {
"gor " + DataUtil.toFile("../tests/data/external/samtools/cram_query_sorted", DataType.CRAM)
, "-config", "../tests/data/ref_mini/gor_config.txt"};
String[] linesWithMissingAttributesProjectRef = TestUtils.runGorPipe(args, false).split(LINE_SPLIT_PATTERN);

Assert.assertEquals(8, linesWithoutMissingAttributes.length);
Assert.assertEquals(8, linesWithMissingAttributesCramRef.length);
Assert.assertEquals(8, linesWithMissingAttributesProjectRef.length);
// See if we have the missing entry in the last column.
Assert.assertFalse(linesWithoutMissingAttributes[1].contains("NM="));
Assert.assertTrue(linesWithMissingAttributesCramRef[1].contains("NM="));
Assert.assertTrue(linesWithMissingAttributesProjectRef[1].contains("NM="));
}

@Test(expected = GorResourceException.class)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,17 +32,19 @@
import org.apache.commons.io.FilenameUtils;
import org.apache.commons.lang3.StringUtils;
import org.gorpipe.exceptions.GorResourceException;
import org.gorpipe.gor.driver.meta.DataType;
import org.gorpipe.gor.driver.providers.stream.datatypes.bam.BamIterator;
import org.gorpipe.gor.driver.providers.stream.datatypes.cram.reference.CompositeReferenceSource;
import org.gorpipe.gor.driver.providers.stream.datatypes.cram.reference.EBIReferenceSource;
import org.gorpipe.gor.driver.providers.stream.datatypes.cram.reference.FolderReferenceSource;
import org.gorpipe.gor.driver.providers.stream.datatypes.cram.reference.SharedFastaReferenceSource;
import org.gorpipe.gor.model.ChromoLookup;
import org.gorpipe.gor.session.GorSession;
import org.gorpipe.gor.driver.adapters.StreamSourceSeekableStream;
import org.gorpipe.gor.driver.providers.stream.sources.StreamSource;
import org.gorpipe.gor.table.util.PathUtils;
import org.gorpipe.gor.util.DataUtil;
import org.gorpipe.gor.util.StringUtil;
import org.gorpipe.gor.model.Row;
import org.gorpipe.gor.model.SharedFastaReferenceSource;
import org.gorpipe.model.gor.iterators.RefSeq;
import org.gorpipe.util.Strings;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

Expand All @@ -66,19 +68,21 @@
*/
public class CramIterator extends BamIterator {

private final static String KEY_GENERATEMISSINGATTRIBUTES = "gor.driver.cram.generatemissingattributes";
private final static String KEY_FASTAREFERENCESOURCE = "gor.driver.cram.fastareferencesource";
public final static String KEY_GENERATEMISSINGATTRIBUTES = "gor.driver.cram.generatemissingattributes";
public final static String KEY_FASTAREFERENCESOURCE = "gor.driver.cram.fastareferencesource";
/**
 * Name of the system property consulted when the CRAM reference resolves to a regular file
 * (see {@code createFileReference}); the property is read with a default of {@code "true"}.
 * The key must not end with a trailing dot, otherwise {@code -Dgor.driver.cram.reference.force.folder=...}
 * would silently be ignored.
 */
public final static String KEY_REFERENCE_FORCE_FOLDER = "gor.driver.cram.reference.force.folder";

private static final Logger log = LoggerFactory.getLogger(CramIterator.class);

private CramFile cramFile;
private int[] columns;
ChromoLookup lookup;
private String fileName;
private String cramReferencePath = "";
private CRAMFileReader cramFileReader;
private ReferenceSequenceFile referenceSequenceFile;
private String projectCramReferencePath; // Cram reference path from project context.
private RefSeq projectRefSeq; // RefSeq from project context, used for MD tag calculation.
private ReferenceSequenceFile referenceSequenceFile; // Handle to cram reference file, fallback for MD tag calculation.
private CRAMReferenceSource referenceSource;
private CRAMFileReader cramFileReader;
private boolean generateMissingCramAttributes;

/**
Expand All @@ -94,35 +98,6 @@ public CramIterator(ChromoLookup lookup, CramFile cramFile, int[] columns) {
this.lookup = lookup;
}


/**
* Construct a CramIterator
*
* @param lookup The lookup service for chromosome name to ids
* @param file The CRAM File to iterate through
*/
public CramIterator(ChromoLookup lookup, String file, String index, String reference, boolean generateMissingAttributes) {

fileName = file;
generateMissingCramAttributes = generateMissingAttributes;
File cramfile = new File(file);
File cramindex = new File(index);
if (!cramindex.exists()) {
cramindex = new File(DataUtil.toFile(file, DataType.CRAI));
}

referenceSequenceFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(new File(reference));
referenceSource = createReferenceSource(fileName, "");

try {
cramFileReader = new CRAMFileReader(cramfile, new FileInputStream(cramindex), referenceSource);
} catch (FileNotFoundException e) {
throw new GorResourceException("Cram file not found.", file, e);
}
SamReader samreader = new SamReader.PrimitiveSamReaderToSamReaderAdapter(cramFileReader, null);
init(lookup, samreader, true);
}

@Override
public Row next() {
Row row = super.next();
Expand All @@ -135,8 +110,16 @@ public Row next() {
boolean calculateNM = record.getIntegerAttribute(SAMTag.NM.name()) == null;

if (calculateMD) {
byte[] referenceBytes = referenceSequenceFile.getSubsequenceAt(record.getContig(), record.getAlignmentStart(), record.getAlignmentEnd()).getBases();
CramUtils.calculateMdAndNmTags(record, referenceBytes, calculateMD, calculateNM);
byte[] referenceBytes = null;
if (projectRefSeq != null) {
referenceBytes = projectRefSeq.getBases(record.getContig(), record.getAlignmentStart(), record.getAlignmentEnd()).getBytes();
} else if (referenceSequenceFile != null) {
// Fallback to the reference file used by the CRAM reader
referenceBytes = referenceSequenceFile.getSubsequenceAt(record.getContig(), record.getAlignmentStart(), record.getAlignmentEnd()).getBases();
}
if (referenceBytes != null) {
CramUtils.calculateMdAndNmTags(record, referenceBytes, calculateMD, calculateNM);
}
} else if (calculateNM) {
SequenceUtil.calculateSamNmTagFromCigar(record);
}
Expand Down Expand Up @@ -170,7 +153,10 @@ public void init(GorSession session) {
return;
}

cramReferencePath = session.getProjectContext().getReferenceBuild().getCramReferencePath();
projectCramReferencePath = session.getProjectContext().getReferenceBuild().getCramReferencePath();
if (!Strings.isNullOrEmpty(session.getProjectContext().getGorConfigFile())) {
projectRefSeq = session.getProjectContext().createRefSeq();
}

if (cramFile != null) {
// I read this property here through System.getProperty as there is no other way to pass properties to the driver
Expand Down Expand Up @@ -237,7 +223,7 @@ private CRAMReferenceSource createReferenceSource(String ref, String root) {
}

// This reference should be fasta but we let the htsjdk library decide
return createFileReference(file.toString());
return createFileReference(file);
}

private File getReferenceFromGorOptions(File file) {
Expand All @@ -252,8 +238,8 @@ private File getReferenceFromGorOptions(File file) {
}

private File getReferenceFromGorConfig(File file, String root) {
if (!file.exists() && !StringUtil.isEmpty(cramReferencePath)) {
return PathUtils.resolve(Paths.get(root), Paths.get(cramReferencePath)).toFile();
if (!file.exists() && !Strings.isNullOrEmpty(projectCramReferencePath)) {
return PathUtils.resolve(Paths.get(root), Paths.get(projectCramReferencePath)).toFile();
}
return file;
}
Expand All @@ -277,10 +263,22 @@ private File getReferenceFromReferenceLinkFile(File file) {
return file;
}

private CRAMReferenceSource createFileReference(String ref) {
String referenceKey = FilenameUtils.removeExtension(FilenameUtils.getBaseName(ref));
referenceSequenceFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(new File(ref));
return new SharedFastaReferenceSource(referenceSequenceFile, referenceKey);
/**
 * Creates the CRAM reference source for the resolved reference location.
 * <ul>
 *   <li>If {@code refFile} is a directory, a composite of a {@link FolderReferenceSource} and an
 *       {@link EBIReferenceSource} over that directory is returned.</li>
 *   <li>If {@code refFile} is not a directory and the system property
 *       {@link #KEY_REFERENCE_FORCE_FOLDER} is true (the default), the same composite is built
 *       over the parent directory of {@code refFile}.</li>
 *   <li>Otherwise the file is opened as a reference sequence file, remembered in
 *       {@code referenceSequenceFile}, and wrapped in a {@link SharedFastaReferenceSource}.</li>
 * </ul>
 *
 * @param refFile the reference file or folder
 * @return the reference source to hand to the CRAM reader
 */
private CRAMReferenceSource createFileReference(File refFile) {
    if (refFile.isDirectory()) {
        return new CompositeReferenceSource(List.of(
                new FolderReferenceSource(refFile.getPath()),
                new EBIReferenceSource(refFile.getPath())));
    }

    // Boolean.parseBoolean interprets the property VALUE. Boolean.getBoolean(String) must not be
    // used here: it treats its argument as a property NAME, i.e. it would look up a system
    // property called "true"/"false" and practically always yield false.
    boolean forceFolder = Boolean.parseBoolean(System.getProperty(KEY_REFERENCE_FORCE_FOLDER, "true"));
    if (forceFolder) {
        // NOTE(review): File.getParent() returns null for a bare relative file name — confirm
        // that callers always pass a path with a parent component.
        String parentFolder = refFile.getParent();
        return new CompositeReferenceSource(List.of(
                new FolderReferenceSource(parentFolder),
                new EBIReferenceSource(parentFolder)));
    }

    // Handle kept on the iterator; next() falls back to it when computing missing MD/NM tags.
    referenceSequenceFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(refFile);

    String referenceKey = FilenameUtils.removeExtension(refFile.getName());
    // NOTE(review): a second handle is opened for the shared source, presumably so its lifecycle
    // is independent of the field above — confirm before merging the two.
    var referenceFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(refFile);
    return new SharedFastaReferenceSource(referenceFile, referenceKey);
}

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
package org.gorpipe.gor.driver.providers.stream.datatypes.cram.reference;

import htsjdk.samtools.SAMSequenceRecord;
import htsjdk.samtools.cram.ref.CRAMReferenceSource;

import java.io.Closeable;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

/**
 * Composite reference source that tries its delegate reference sources in order and returns
 * the first non-null result.
 */
public class CompositeReferenceSource implements CRAMReferenceSource, Closeable {

    /** Delegates, in lookup order. A private copy of the list passed to the constructor. */
    final List<CRAMReferenceSource> sources;

    /**
     * @param sources the delegates to consult, in priority order; {@code null} is treated as an
     *                empty list. The list is copied, so later changes by the caller have no effect.
     */
    public CompositeReferenceSource(List<CRAMReferenceSource> sources) {
        this.sources = sources != null ? new ArrayList<>(sources) : new ArrayList<>();
    }

    /**
     * Asks each delegate in order for the bases of the given sequence.
     *
     * @return the first non-null answer, or {@code null} if no delegate could supply the bases
     */
    @Override
    public byte[] getReferenceBases(SAMSequenceRecord sequenceRecord, boolean tryNameVariants) {
        for (var source : sources) {
            byte[] bytes = source.getReferenceBases(sequenceRecord, tryNameVariants);
            if (bytes != null) {
                return bytes;
            }
        }
        return null;
    }

    /**
     * Asks each delegate in order for the requested region of the given sequence.
     *
     * @return the first non-null answer, or {@code null} if no delegate could supply the region
     */
    @Override
    public byte[] getReferenceBasesByRegion(SAMSequenceRecord sequenceRecord, int zeroBasedStart, int requestedRegionLength) {
        for (var source : sources) {
            byte[] bytes = source.getReferenceBasesByRegion(sequenceRecord, zeroBasedStart, requestedRegionLength);
            if (bytes != null) {
                return bytes;
            }
        }
        return null;
    }

    /**
     * Closes every delegate that implements {@link Closeable} and then empties the delegate list.
     * A failure in one delegate does not prevent the remaining delegates from being closed.
     *
     * @throws IOException the first failure encountered; any later failures are attached to it
     *                     as suppressed exceptions
     */
    @Override
    public void close() throws IOException {
        IOException firstFailure = null;
        for (var source : sources) {
            if (source instanceof Closeable) {
                try {
                    ((Closeable) source).close();
                } catch (IOException e) {
                    // Keep going so the remaining delegates are not leaked.
                    if (firstFailure == null) {
                        firstFailure = e;
                    } else {
                        firstFailure.addSuppressed(e);
                    }
                }
            }
        }
        sources.clear();
        if (firstFailure != null) {
            throw firstFailure;
        }
    }
}
Loading
Loading