Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for random reading of vcf file. #180

Merged
merged 5 commits into from
Nov 13, 2019
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions src/cljam/io/bam_index.clj
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
(ns cljam.io.bam-index
"Parser for a BAM index file."
(:require [cljam.io.bam-index.core :as bai-core]))
(:require
[cljam.io.bam-index.core :as bai-core]
[cljam.io.util.bin :as util-bin]))

(defn bin-index
"Returns binning index for the given reference index."
Expand All @@ -15,7 +17,7 @@
(defn get-spans
"Returns regions of a BAM file that may contain an alignment for the given range."
[bai ref-idx beg end]
(bai-core/get-spans bai ref-idx beg end))
(util-bin/get-spans bai ref-idx beg end))

(defn bam-index
"Returns a cljam.bam-index.core.BAMIndex."
Expand Down
57 changes: 18 additions & 39 deletions src/cljam/io/bam_index/core.clj
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,22 @@
"The core of BAM index features."
(:require [clojure.java.io :as cio]
[clojure.tools.logging :as logging]
[cljam.io.bam-index [common :refer :all]
[chunk :as chunk]
[reader :as reader]
[writer :as writer]]
[cljam.util :as util])
[cljam.io.bam-index [reader :as reader]
[writer :as writer]]
[cljam.util :as util]
[cljam.io.util.bin :as util-bin])
(:import [java.io DataOutputStream FileOutputStream]
cljam.io.bam_index.reader.BAIReader
cljam.io.bam_index.writer.BAIWriter))

(deftype BAMIndex [url bidx lidx])
(deftype BAMIndex [url bidx lidx]
util-bin/IBinaryIndex
(get-chunks [this ref-idx bins]
(into [] (mapcat (get (.bidx this) ref-idx) bins)))
(get-min-offset [this ref-idx beg]
(util-bin/calculate-min-offset
(get (.lidx this) ref-idx)
beg)))

(defn bam-index [f]
(let [{:keys [bidx lidx]} (with-open [r ^BAIReader (reader/reader f)] (reader/read-all-index! r))]
Expand All @@ -29,39 +35,6 @@
(with-open [r ^BAIReader (reader/reader f)]
(reader/read-linear-index! r ref-idx)))

(defn- reg->bins*
"Returns candidate bins for the specified region as a vector."
[^long beg ^long end]
(let [max-pos 0x1FFFFFFF
beg (if (<= beg 0) 0 (bit-and (dec beg) max-pos))
end (if (<= end 0) max-pos (bit-and (dec end) max-pos))]
(if (<= beg end)
(loop [bins (transient [0])
xs [[1 26] [9 23] [73 20] [585 17] [4681 14]]]
(if-let [[^long ini shift] (first xs)]
(let [ini* (+ ini (bit-shift-right beg shift))
end* (+ ini (bit-shift-right end shift))]
(recur
(loop [b bins k ini*]
(if (<= k end*)
(recur (conj! b k) (inc k))
b))
(next xs)))
(persistent! bins))))))

(def ^:private reg->bins (memoize reg->bins*))

(defn get-spans
[^BAMIndex bai ^long ref-idx ^long beg ^long end]
(let [bins (reg->bins beg end)
bidx (get (.bidx bai) ref-idx)
lidx (get (.lidx bai) ref-idx)
chunks (into [] (comp (map bidx) cat) bins)
lin-beg (writer/pos->lidx-offset beg)
min-offset (get lidx lin-beg 0)]
(->> (chunk/optimize-chunks chunks min-offset)
alumi marked this conversation as resolved.
Show resolved Hide resolved
(map vals))))

(defn get-unplaced-spans
[^BAMIndex bai]
(if-let [begin (some->>
Expand All @@ -74,8 +47,14 @@
[[begin Long/MAX_VALUE]]
[]))

(defn get-spans
[^BAMIndex bai ^long ref-idx ^long beg ^long end]
(util-bin/get-spans bai ref-idx beg end))


;; ## Writing


(defn ^BAIWriter writer
[f refs]
(BAIWriter. (DataOutputStream. (FileOutputStream. (cio/file f)))
Expand Down
12 changes: 6 additions & 6 deletions src/cljam/io/bam_index/reader.clj
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,11 @@
(:require [clojure.java.io :as cio]
[cljam.io.util.lsb :as lsb]
[cljam.io.bam-index.common :refer [bai-magic]]
[cljam.io.bam-index.chunk :as chunk]
[cljam.io.util.chunk :as chunk]
[cljam.util :as util])
(:import java.util.Arrays
[java.io FileInputStream Closeable IOException]
[cljam.io.bam_index.chunk Chunk]))
[cljam.io.util.chunk Chunk]))

(deftype BAIReader [url reader]
Closeable
Expand Down Expand Up @@ -49,10 +49,10 @@
(defn- read-chunks!
[rdr]
(let [n (lsb/read-int rdr)]
(loop [i 0, chunks []]
(if (< i n)
(recur (inc i) (conj chunks (Chunk. (lsb/read-long rdr) (lsb/read-long rdr))))
chunks))))
(loop [i 0, chunks []]
(if (< i n)
(recur (inc i) (conj chunks (Chunk. (lsb/read-long rdr) (lsb/read-long rdr))))
chunks))))

(defn- read-bin-index**!
[rdr]
Expand Down
4 changes: 2 additions & 2 deletions src/cljam/io/bam_index/writer.clj
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,12 @@
[cljam.io.util.bgzf :as bgzf]
[cljam.io.util.lsb :as lsb]
[cljam.io.bam-index.common :refer :all]
[cljam.io.bam-index.chunk :as chunk]
[cljam.io.util.chunk :as chunk]
[cljam.io.bam.decoder :as bam-decoder])
(:import [java.io DataOutputStream Closeable]
[java.nio ByteBuffer ByteOrder]
[cljam.io.bam.decoder BAMPointerBlock]
[cljam.io.bam_index.chunk Chunk]))
[cljam.io.util.chunk Chunk]))

(declare make-index-from-blocks)

Expand Down
90 changes: 52 additions & 38 deletions src/cljam/io/tabix.clj
Original file line number Diff line number Diff line change
Expand Up @@ -2,41 +2,58 @@
"Alpha - subject to change.
Reader of a TABIX format file."
(:require [cljam.io.util.bgzf :as bgzf]
[cljam.io.util.lsb :as lsb])
[cljam.io.util.lsb :as lsb]
[cljam.io.util.bin :as util-bin]
[clojure.string :as cstr])
(:import java.util.Arrays
[java.io DataInputStream IOException]))
[java.io DataInputStream IOException]
[cljam.io.util.chunk Chunk]))

(deftype Tabix [n-ref preset sc bc ec meta skip seq bidx lidx]
util-bin/IBinaryIndex
(get-chunks [this ref-idx bins]
(into [] (mapcat (get (.bidx this) ref-idx) bins)))
(get-min-offset [this ref-idx beg]
(util-bin/calculate-min-offset
(get (.lidx this) ref-idx)
beg))
(get-ref-index [this chr]
(.indexOf
^clojure.lang.PersistentVector
(.seq this)
chr)))

(def tabix-magic "TBI\1")

(defn- read-chunks!
[rdr]
(->> #(Chunk. (lsb/read-long rdr) (lsb/read-long rdr))
(repeatedly (lsb/read-int rdr))
vec))

(defn- read-seq
[buf len]
(loop [i 0, j 0, seq* []]
(if (< i len)
(if (zero? (nth buf i))
(let [ba-len (- i j)
ba (byte-array ba-len)]
(System/arraycopy buf j ba 0 ba-len)
(recur (inc i) (inc i) (conj seq* (String. ba))))
(recur (inc i) j seq*))
seq*)))
[^bytes buf]
(cstr/split (String. buf) #"\00"))

(defn- read-chunk
[^DataInputStream rdr]
{:beg (lsb/read-long rdr)
:end (lsb/read-long rdr)})
(defn- read-bin-index
[rdr]
(->> #(hash-map
:bin (lsb/read-int rdr)
:chunks (read-chunks! rdr))
(repeatedly (lsb/read-int rdr))
vec))

(defn- read-bin
[^DataInputStream rdr]
(let [bin (lsb/read-int rdr)
n-chunk (lsb/read-int rdr)]
{:bin bin
:chunks (doall (map (fn [_] (read-chunk rdr)) (range n-chunk)))}))
(defn- read-linear-index
[rdr]
(->> #(lsb/read-long rdr)
(repeatedly (lsb/read-int rdr))
vec))

(defn- read-index*
[^DataInputStream rdr]
(when-not (Arrays/equals ^bytes (lsb/read-bytes rdr 4) (.getBytes ^String tabix-magic))
(throw (IOException. "Invalid TABIX file")))
(let [n-seq (lsb/read-int rdr)
(let [n-ref (lsb/read-int rdr)
preset (lsb/read-int rdr)
sc (lsb/read-int rdr)
bc (lsb/read-int rdr)
Expand All @@ -45,21 +62,18 @@
skip (lsb/read-int rdr)
len (lsb/read-int rdr)
buf (lsb/read-bytes rdr len)
seq (read-seq buf len)
index (loop [i 0
bin-index []
linear-index []]
(if (< i n-seq)
(let [n-bin (lsb/read-int rdr)
new-bin-index (doall (map (fn [_] (read-bin rdr)) (range n-bin)))
n-linear-index (lsb/read-int rdr)
new-linear-index (doall (map (fn [_] (lsb/read-long rdr)) (range n-linear-index)))]
(recur (inc i)
(conj bin-index new-bin-index)
(conj linear-index new-linear-index)))
[bin-index linear-index]))]
{:n-seq n-seq, :preset preset, :sc sc, :bc bc, :ec ec, :meta meta,
:skip skip, :seq seq ,:bin-index (first index), :linear-index (last index)}))
seq (read-seq buf)
refs (range n-ref)
all-idx (map (fn [_] [(read-bin-index rdr) (read-linear-index rdr)]) refs)
bidx-seq (map first all-idx)
bidx (zipmap
refs
(map (fn [bins]
(into {} (map (juxt :bin :chunks)) bins))
bidx-seq))
lidx (zipmap refs (map second all-idx))]
(->Tabix n-ref preset sc bc ec meta
skip seq bidx lidx)))

(defn read-index
[f]
Expand Down
44 changes: 44 additions & 0 deletions src/cljam/io/util/bin.clj
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
(ns cljam.io.util.bin
(:require [cljam.io.util.chunk :as util-chunk]
[cljam.io.bam-index.writer :as bam-index-writer]))

(defn- reg->bins*
"Returns candidate bins for the specified region as a vector."
[^long beg ^long end]
(let [max-pos 0x1FFFFFFF
beg (if (<= beg 0) 0 (bit-and (dec beg) max-pos))
end (if (<= end 0) max-pos (bit-and (dec end) max-pos))]
(if (<= beg end)
(loop [bins (transient [0])
xs [[1 26] [9 23] [73 20] [585 17] [4681 14]]]
(if-let [[^long ini shift] (first xs)]
(let [ini* (+ ini (bit-shift-right beg shift))
end* (+ ini (bit-shift-right end shift))]
(recur
(loop [b bins k ini*]
(if (<= k end*)
(recur (conj! b k) (inc k))
b))
(next xs)))
(persistent! bins))))))

(def ^:private reg->bins (memoize reg->bins*))

(defprotocol IBinaryIndex
(get-chunks [this ref-idx bins])
(get-min-offset [this ref-idx beg])
(get-ref-index [this chr]))

(defn calculate-min-offset
"Calculate offset for get-spans."
[lidx beg]
(get lidx (bam-index-writer/pos->lidx-offset beg) 0))
alumi marked this conversation as resolved.
Show resolved Hide resolved

(defn get-spans
"Calculate span information for random access from ndex data such as tabix."
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
"Calculate span information for random access from ndex data such as tabix."
"Calculate span information for random access from index data such as tabix."

[index-data ^long ref-idx ^long beg ^long end]
(let [bins (reg->bins beg end)
chunks (get-chunks index-data ref-idx bins)
min-offset (get-min-offset index-data ref-idx beg)]
(->> (util-chunk/optimize-chunks chunks min-offset)
(map vals))))
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
(ns cljam.io.bam-index.chunk
(ns cljam.io.util.chunk
(:refer-clojure :exclude [compare])
(:require [cljam.io.util.bgzf :as bgzf]))

Expand Down Expand Up @@ -52,10 +52,10 @@
ret (transient [])]
(if f
(cond
(<= (.end f) min-offset) (recur r last-chunk ret)
(nil? last-chunk) (recur r f (conj! ret f))
(and (not (overlap? last-chunk f))
(not (adjacent? last-chunk f))) (recur r f (conj! ret f))
(> (.end f) (.end last-chunk)) (let [l (assoc last-chunk :end (.end f))] (recur r l (conj! (pop! ret) l)))
:else (recur r last-chunk ret))
(<= (.end f) min-offset) (recur r last-chunk ret)
(nil? last-chunk) (recur r f (conj! ret f))
(and (not (overlap? last-chunk f))
(not (adjacent? last-chunk f))) (recur r f (conj! ret f))
(> (.end f) (.end last-chunk)) (let [l (assoc last-chunk :end (.end f))] (recur r l (conj! (pop! ret) l)))
:else (recur r last-chunk ret))
(persistent! ret)))))
21 changes: 18 additions & 3 deletions src/cljam/io/vcf.clj
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,15 @@
[cljam.io.vcf.reader :as vcf-reader]
[cljam.io.vcf.writer :as vcf-writer]
[cljam.io.bcf.reader :as bcf-reader]
[cljam.io.bcf.writer :as bcf-writer])
[cljam.io.bcf.writer :as bcf-writer]
[cljam.io.util.bgzf :as bgzf]
[cljam.io.tabix :as tabix])
(:import java.io.Closeable
cljam.io.vcf.reader.VCFReader
cljam.io.vcf.writer.VCFWriter
cljam.io.bcf.reader.BCFReader
cljam.io.bcf.writer.BCFWriter))
cljam.io.bcf.writer.BCFWriter
bgzf4j.BGZFInputStream))

;; Reading
;; -------
Expand All @@ -28,7 +31,11 @@
header (with-open [r (cio/reader (util/compressor-input-stream f))]
(vcf-reader/load-header r))]
(VCFReader. (util/as-url f) meta-info header
(cio/reader (util/compressor-input-stream f)))))

(if (bgzf/bgzip? f)
(bgzf/bgzf-input-stream f)
(cio/reader (util/compressor-input-stream f)))
(delay (tabix/read-index (str f ".tbi"))))))

(defn ^BCFReader bcf-reader
"Returns an open cljam.io.bcf.reader.BCFReader of f. Should be used inside
Expand Down Expand Up @@ -70,6 +77,14 @@
([rdr] (protocols/read-variants rdr))
([rdr option] (protocols/read-variants rdr option)))

(defn read-variants-randomly
"Reads variants of the VCF file randomly, returning them as a lazy sequence."
([rdr span-option depth-option]
(vcf-reader/read-variants-randomly
rdr
span-option
depth-option)))

;; Writing
;; -------

Expand Down
Loading