Skip to content

Commit

Permalink
add protocol splitting to assay file reader
Browse files Browse the repository at this point in the history
  • Loading branch information
HLWeil committed Feb 27, 2021
1 parent 50c2411 commit 1ba9aa9
Show file tree
Hide file tree
Showing 4 changed files with 99 additions and 38 deletions.
31 changes: 17 additions & 14 deletions src/ISADotNet.XLSX/AssayFile/AnnotationColumn.fs
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ open System.Text.RegularExpressions
/// Functions for parsing the headers of annotation table columns
module AnnotationColumn =

type SwateHeader =
type ColumnHeader =
{
HeaderString : string
Kind : string
Expand Down Expand Up @@ -42,14 +42,14 @@ module AnnotationColumn =
ontologySourceRegex.Value.Split ':'
|> fun o -> o.[0], o.[1]
else "", ""
SwateHeader.create header kind.Value (Some (OntologyAnnotation.fromString nameRegex.Value termAccession termSource)) number
ColumnHeader.create header kind.Value (Some (OntologyAnnotation.fromString nameRegex.Value termAccession termSource)) number
elif kindRegex.Success then
let kind = kindRegex.Value
let numberRegex = Regex.Match(header,numberPattern)
let number = if numberRegex.Success then Some (int numberRegex.Value) else None
SwateHeader.create header kind None number
ColumnHeader.create header kind None number
else
SwateHeader.create header header None None
ColumnHeader.create header header None None


let mergeOntology (termSourceHeaderOntology : OntologyAnnotation Option) (termAccessionHeaderOntology : OntologyAnnotation Option) =
Expand All @@ -60,17 +60,17 @@ module AnnotationColumn =
| None, None -> None


/// Parses `header` and returns it if it is the "Term Source REF" column belonging to `termHeader`.
///
/// The parsed header is accepted when its kind is "Term Source REF", its number equals the
/// number of `termHeader`, and either both headers carry no term or both carry a term with
/// the same name. In every other case `None` is returned.
let tryParseTermSourceReferenceHeader (termHeader:ColumnHeader) (header:string) =
    let parsed = ColumnHeader.fromStringHeader header
    if parsed.Kind = "Term Source REF" && parsed.Number = termHeader.Number then
        match parsed.Term, termHeader.Term with
        | None, None -> Some parsed
        | Some ownTerm, Some referenceTerm when ownTerm.Name = referenceTerm.Name -> Some parsed
        | _ -> None
    else
        None

let tryParseTermAccessionNumberHeader (termHeader:SwateHeader) (header:string) =
match SwateHeader.fromStringHeader header with
let tryParseTermAccessionNumberHeader (termHeader:ColumnHeader) (header:string) =
match ColumnHeader.fromStringHeader header with
| h when h.Kind = "Term Accession Number" && h.Number = termHeader.Number ->
match h.Term,termHeader.Term with
| None, None -> Some h
Expand All @@ -79,37 +79,40 @@ module AnnotationColumn =
| _ -> None

/// Parses `header` and returns it if its kind is "Parameter" or "Parameter Value", otherwise `None`.
let tryParseParameterHeader (header:string) =
    let parsed = ColumnHeader.fromStringHeader header
    if parsed.Kind = "Parameter" || parsed.Kind = "Parameter Value" then
        Some parsed
    else
        None

/// Parses `header` and returns it if its kind is "Factor" or "Factor Value", otherwise `None`.
let tryParseFactorHeader (header:string) =
    let parsed = ColumnHeader.fromStringHeader header
    if parsed.Kind = "Factor" || parsed.Kind = "Factor Value" then
        Some parsed
    else
        None

/// Parses `header` and returns it if its kind is "Characteristics" or "Characteristics Value", otherwise `None`.
let tryParseCharacteristicsHeader (header:string) =
    let parsed = ColumnHeader.fromStringHeader header
    if parsed.Kind = "Characteristics" || parsed.Kind = "Characteristics Value" then
        Some parsed
    else
        None

/// Parses `header` and returns it if its kind is "Unit", otherwise `None`.
let tryParseUnitHeader (header:string) =
    let parsed = ColumnHeader.fromStringHeader header
    if parsed.Kind = "Unit" then Some parsed else None

/// Parses `header` and returns it if its kind is "Sample Name", otherwise `None`.
let tryParseSampleName (header:string) =
    let parsed = ColumnHeader.fromStringHeader header
    if parsed.Kind = "Sample Name" then Some parsed else None

/// Parses `header` and returns it if its kind is "Source Name", otherwise `None`.
let tryParseSourceName (header:string) =
    let parsed = ColumnHeader.fromStringHeader header
    if parsed.Kind = "Source Name" then Some parsed else None

/// True if `header` parses as a "Sample Name" column header.
let isSample header =
    match tryParseSampleName header with
    | Some _ -> true
    | None -> false
/// True if `header` parses as a "Source Name" column header.
let isSource header =
    match tryParseSourceName header with
    | Some _ -> true
    | None -> false
6 changes: 4 additions & 2 deletions src/ISADotNet.XLSX/AssayFile/AnnotationNode.fs
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,13 @@ open AnnotationColumn

module AnnotationNode =

/// Type abbreviation for a sequence of parsed column headers.
/// NOTE(review): presumably the headers of all columns that form one annotation node - confirm against usage.
type NodeHeader = ColumnHeader seq

/// Splits the headers of an annotation table into nodes, i.e. groups of consecutive headers
let splitIntoNodes (headers : seq<string>) =
headers
|> Seq.groupWhen (fun header ->
match (AnnotationColumn.SwateHeader.fromStringHeader header).Kind with
|> Seq.groupWhen false (fun header ->
match (AnnotationColumn.ColumnHeader.fromStringHeader header).Kind with
| "Unit" -> false
| "Term Source REF" -> false
| "Term Accession Number" -> false
Expand Down
78 changes: 65 additions & 13 deletions src/ISADotNet.XLSX/AssayFile/AnnotationTable.fs
Original file line number Diff line number Diff line change
@@ -1,35 +1,87 @@
namespace ISADotNet.XLSX.AssayFile

open ISADotNet.XLSX
open ISADotNet

module AnnotationTable =




let splitIntoProtocols (sheetName:string) (namedProtocols : (Protocol * seq<string>) seq) (headers : seq<string>) =
seq {
Protocol.create None (Some sheetName) None None None None None None None, headers
}
/// Splits the headers of a sheet into groups of columns, using the "Source Name" and "Sample Name" columns as boundaries.
///
/// The headers are returned as one single group when
/// - there is exactly one sample column and at most one source column, or
/// - there is no source column and exactly two sample columns which are the first and the last header.
///
/// Otherwise a new group is started at every sample or source column (`Seq.groupWhen` without overlap).
///
/// `sheetName` is currently not used by this function.
let splitBySamples (sheetName:string) (headers : seq<string>) =
    let isSample header = AnnotationColumn.tryParseSampleName header |> Option.isSome
    let isSource header = AnnotationColumn.tryParseSourceName header |> Option.isSome

    let countWhere predicate = headers |> Seq.filter predicate |> Seq.length
    let sourceCount = countWhere isSource
    let sampleCount = countWhere isSample

    // Only evaluated when exactly two sample columns exist, so head and last are safe to take
    let samplesEncloseTable () = isSample (Seq.head headers) && isSample (Seq.last headers)

    if sampleCount = 1 && (sourceCount = 1 || sourceCount = 0) then
        Seq.singleton headers
    elif sourceCount = 0 && sampleCount = 2 && samplesEncloseTable () then
        Seq.singleton headers
    else
        headers |> Seq.groupWhen false (fun header -> isSample header || isSource header)

/// Distributes the headers of a sheet onto the given named protocols.
///
/// Headers whose text contains "Sample" or "Source" are treated as sample columns and set aside.
/// The named protocols are then checked in input order: a protocol whose complete header set is
/// still available among the remaining headers claims those headers. Headers left over at the end
/// are collected under `None`. Matching stops as soon as no headers remain.
///
/// Finally the sample columns are added to every resulting group: a single sample column is
/// prepended, two sample columns enclose the group, three or more are appended.
///
/// Note that the groups are accumulated by consing (the result is in reverse matching order) and
/// that the headers of each group are produced by `Set.toSeq`, i.e. in the set's sort order rather
/// than in their original column order.
let splitByNamedProtocols (namedProtocols : (Protocol * seq<string>) seq) (headers : seq<string>) =
    let isSampleOrSource (header:string) = header.Contains "Sample" || header.Contains "Source"

    let sampleColumns = headers |> Seq.filter isSampleOrSource |> Seq.toList
    let otherColumns = headers |> Seq.filter (fun header -> not (isSampleOrSource header))

    let candidates =
        namedProtocols
        |> Seq.map (fun (protocol, protocolHeaders) -> protocol, Set.ofSeq protocolHeaders)
        |> List.ofSeq

    // Walk over the candidates, carrying the groups found so far and the headers not yet claimed
    let matched, unassigned =
        candidates
        |> List.fold
            (fun (groups, remaining) (protocol, protocolHeaders) ->
                if Set.isEmpty remaining then
                    groups, remaining
                elif Set.isSubset protocolHeaders remaining then
                    (Some protocol, Set.toSeq protocolHeaders) :: groups, Set.difference remaining protocolHeaders
                else
                    groups, remaining
            )
            ([], Set.ofSeq otherColumns)

    let protocolOverlaps =
        if Set.isEmpty unassigned then matched
        else (None, Set.toSeq unassigned) :: matched

    let mapHeaders addSamples =
        protocolOverlaps |> List.map (fun (protocol, groupHeaders) -> protocol, addSamples groupHeaders)

    match sampleColumns with
    | [] -> protocolOverlaps
    | [single] -> mapHeaders (fun groupHeaders -> Seq.append [single] groupHeaders)
    | [first; second] -> mapHeaders (fun groupHeaders -> Seq.append (Seq.append [first] groupHeaders) [second])
    | many -> mapHeaders (fun groupHeaders -> Seq.append groupHeaders many)

/// Assigns names to the unnamed protocols of a sheet. Protocols that already have a name are left untouched.
///
/// - No unnamed protocol: the input is returned unchanged.
/// - Exactly one unnamed protocol: it is named `sheetName`.
/// - Several unnamed protocols: they are named `<sheetName>_<i>`, where `i` counts the unnamed
///   protocols in input order, starting at 0.
///
/// The header sequence paired with each protocol is passed through unchanged.
let indexProtocolsBySheetName (sheetName:string) (protocols : (Protocol * seq<string>) seq) =
    let unnamedProtocolCount = protocols |> Seq.filter (fun (p,_) -> p.Name.IsNone) |> Seq.length
    match unnamedProtocolCount with
    | 0 -> protocols
    | 1 ->
        protocols
        |> Seq.map (fun (p,hs) ->
            if p.Name.IsNone then
                {p with Name = Some sheetName},hs
            else p,hs
        )
    | _ ->
        // The running index is threaded through Seq.mapFold instead of a mutable counter:
        // a `let mutable` cannot be captured by a closure, and a counter inside a lazy Seq.map
        // would hand out different indices every time the result is enumerated.
        protocols
        |> Seq.mapFold (fun i (p,hs) ->
            if p.Name.IsNone then
                // Use the indexed name; previously the plain sheet name was assigned here
                ({p with Name = Some (sprintf "%s_%i" sheetName i)},hs), i + 1
            else
                (p,hs), i
        ) 0
        |> fst

let getProcessGetter protocolMetaData (columnGroup : seq<seq<string>>) =
let getProcessGetter protocolMetaData (nodes : seq<seq<string>>) =

let characteristics,characteristicValueGetters =
columnGroup |> Seq.choose AnnotationNode.tryGetCharacteristicGetterFunction
nodes |> Seq.choose AnnotationNode.tryGetCharacteristicGetterFunction
|> Seq.fold (fun (cl,cvl) (c,cv) -> c.Value :: cl, cv :: cvl) ([],[])
|> fun (l1,l2) -> List.rev l1, List.rev l2
let factors,factorValueGetters =
columnGroup |> Seq.choose AnnotationNode.tryGetFactorGetterFunction
nodes |> Seq.choose AnnotationNode.tryGetFactorGetterFunction
|> Seq.fold (fun (fl,fvl) (f,fv) -> f.Value :: fl, fv :: fvl) ([],[])
|> fun (l1,l2) -> List.rev l1, List.rev l2
let parameters,parameterValueGetters =
columnGroup |> Seq.choose AnnotationNode.tryGetParameterGetterFunction
nodes |> Seq.choose AnnotationNode.tryGetParameterGetterFunction
|> Seq.fold (fun (pl,pvl) (p,pv) -> p.Value :: pl, pv :: pvl) ([],[])
|> fun (l1,l2) -> List.rev l1, List.rev l2

let inputGetter,outputGetter =
match columnGroup |> Seq.tryPick AnnotationNode.tryGetSourceNameGetter with
match nodes |> Seq.tryPick AnnotationNode.tryGetSourceNameGetter with
| Some inputNameGetter ->
let outputNameGetter = columnGroup |> Seq.tryPick AnnotationNode.tryGetSampleNameGetter
let outputNameGetter = nodes |> Seq.tryPick AnnotationNode.tryGetSampleNameGetter
let inputGetter =
fun matrix i ->
Source.create
Expand All @@ -47,8 +99,8 @@ module AnnotationTable =
|> Sample
(fun matrix i -> inputGetter matrix i |> Source),outputGetter
| None ->
let inputNameGetter = columnGroup |> Seq.head |> AnnotationNode.tryGetSampleNameGetter
let outputNameGetter = columnGroup |> Seq.last |> AnnotationNode.tryGetSampleNameGetter
let inputNameGetter = nodes |> Seq.head |> AnnotationNode.tryGetSampleNameGetter
let outputNameGetter = nodes |> Seq.last |> AnnotationNode.tryGetSampleNameGetter
let inputGetter =
fun matrix i ->
Sample.create
Expand Down
22 changes: 13 additions & 9 deletions src/ISADotNet.XLSX/CollectionAux.fs
Original file line number Diff line number Diff line change
Expand Up @@ -17,24 +17,27 @@ module internal Seq =
///
/// For example:
/// Seq.groupWhen false isOdd [3;3;2;4;1;2] = seq [[3]; [3; 2; 4]; [1; 2]]
let groupWhen f (input:seq<'a>) =
let groupWhen (withOverlap : bool) predicate (input:seq<'a>) =
use en = input.GetEnumerator()

let rec loop cont =
if en.MoveNext() then
if (f en.Current) then
let temp = en.Current
let temp = en.Current
if predicate temp then

loop (fun y ->
printfn "if: %O, temp: %O" y temp
cont
( match y with
| h::t when withOverlap -> [temp]::(temp::h)::t
| h::t -> []::(temp::h)::t
//| h::t -> [temp]::(h)::t
| [] -> [[temp]]
)
)
else
let temp = en.Current
loop (fun y ->
printfn "else: %O, temp: %O" y temp
cont
( match y with
| h::t -> (temp::h)::t
Expand All @@ -46,12 +49,13 @@ module internal Seq =
// Remove when first element is empty due to "[]::(temp::h)::t"
let tmp:seq<seq<'a>> =
match (loop id) with
| h::t -> match h with
| [] -> t
| _ -> h::t
| h::t -> match h with
| [x] when predicate x && withOverlap -> t
| [] -> t
| _ -> h::t
| [] -> []
|> Seq.cast

tmp

module internal Array =
Expand Down

0 comments on commit 1ba9aa9

Please sign in to comment.