Skip to content

Commit

Permalink
Improve ez_histograms (#662)
Browse files Browse the repository at this point in the history
* simplify rownames management

* adding test for misleading column ids (duplicates)

* switch to psych statistics

* update all tests

* Update ez_histograms.xml

* Update ez_histograms.xml
  • Loading branch information
drosofff authored Feb 8, 2024
1 parent 443759a commit 5e25392
Show file tree
Hide file tree
Showing 17 changed files with 1,176 additions and 154 deletions.
24 changes: 9 additions & 15 deletions tools/ez_histograms/ez_histograms.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ library(ggplot2)
library(reshape2)
library(dplyr)
library(scales)
library(vtable)
library(psych)
library(optparse)

options(show.error.messages = FALSE,
Expand Down Expand Up @@ -116,17 +116,9 @@ test_header <- function(file) {
}
}

# Decide whether the first column of `file` should be used as row names.
#
# Only the first two lines of the file are read (without header handling).
# The first cell of the second line is coerced to numeric:
#   - coercion succeeds -> NULL (no row-name column for read.delim)
#   - coercion gives NA -> 1    (use column 1 as row names)
# The return value is meant to be passed as `row.names` to read.delim().
test_rownames <- function(file) {
  first_rows <- read.delim(
    file = file, header = FALSE, row.names = NULL, nrows = 2
  )
  second_row_id <- as.numeric(first_rows[2, 1])
  if (!is.na(second_row_id)) {
    return(NULL)
  }
  return(1)
}

##### prepare input data
# NOTE(review): this text is a rendered diff. The read.delim() call directly
# below (with row.names = test_rownames(...)) appears to be the pre-commit
# line; the second read.delim() call re-assigns `data` and supersedes it.
data <- read.delim(file = opt$file, header = test_header(opt$file), row.names = test_rownames(opt$file))

# Read the input table; test_header() decides whether the first line is
# treated as a header. No column is used as row names here.
data <- read.delim(file = opt$file, header = test_header(opt$file))
data <- data %>% select(where(is.numeric)) # remove non numeric columns
# Melt the numeric table for plotting (the plot further down facets on the
# resulting `variable` column).
mdata <- melt(data)

Expand Down Expand Up @@ -162,8 +154,10 @@ pdf(opt$pdf, width = width, height = height)
# Render plot `p` with one facet per variable, laid out on `ncol` columns,
# each facet with its own axis scales (scales = "free").
# NOTE(review): the target device is presumably the pdf(opt$pdf, ...) call
# shown in the hunk header just above — confirm against the full script.
print(p + facet_wrap(~variable, ncol = ncol, scales = "free"))
# Close the current graphics device.
dev.off()

# Summary statistics with vtable package
# NOTE(review): this appears to be the pre-commit side of the diff; the
# psych-based summary that follows re-assigns `summary_df`.
# The call requests a returned table (out = "return") with the median added
# and the eight column labels listed in `summ.names`.
summary_df <- sumtable(data, digits = 8, out = "return", add.median = TRUE,
summ.names = c("N", "Mean", "Std. Dev.", "Min", "Pctl. 25",
"Median", "Pctl. 75", "Max"))
# Summary statistics with psych package
# describe() is called with skew and ranges disabled and the three quartiles
# requested through `quant`; variable names come back as row names.
summary_df <- describe(
  x = data, skew = FALSE, ranges = FALSE, quant = c(.25, .50, .75)
)
# Expose the variable names as a regular first column so they are written
# to the tsv file (row names are not written, see write.table below).
summary_df <- cbind(var_names = rownames(summary_df), summary_df)
# Column 2 is the first describe() column (the variable index).
colnames(summary_df)[2] <- "var_num"
# Select columns by name rather than by position, so that a change in the
# describe() column layout cannot silently drop or format the wrong column.
# "se" (standard error) is not reported in the summary file.
kept_cols <- setdiff(colnames(summary_df), "se")
summary_df <- summary_df[, kept_cols, drop = FALSE]
# Write the statistics in scientific notation; counts and ids stay as is.
stat_cols <- c("mean", "sd", "Q0.25", "Q0.5", "Q0.75")
summary_df[stat_cols] <- format(summary_df[stat_cols], scientific = TRUE)
write.table(
  summary_df,
  file = opt$summary, sep = "\t", quote = FALSE, row.names = FALSE
)
13 changes: 10 additions & 3 deletions tools/ez_histograms/ez_histograms.xml
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
<tool id="ez_histograms" name="ez_histograms" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
<macros>
<token name="@TOOL_VERSION@">3.4.4</token>
<token name="@VERSION_SUFFIX@">0</token>
<token name="@VERSION_SUFFIX@">1</token>
<token name="@PROFILE@">23.0</token>
</macros>
<requirements>
<requirement type="package" version="3.4.4">r-ggplot2</requirement>
<requirement type="package" version="1.4.4">r-reshape2</requirement>
<requirement type="package" version="1.1.4">r-dplyr</requirement>
<requirement type="package" version="1.3.0">r-scales</requirement>
<requirement type="package" version="1.4.6">r-vtable</requirement>
<requirement type="package" version="2.4.1">r-psych</requirement>
<requirement type="package" version="1.7.4">r-optparse</requirement>
</requirements>

Expand Down Expand Up @@ -108,6 +108,13 @@
<output name="pdf" file="single_headed_col.pdf"/>
<output name="summary" file="summary_6.tsv"/>
</test>
<test expect_num_outputs="2">
<param name="file" value="misleading_ids.tsv"/>
<param name="plot_options_selector" value="density"/>
<param name="xscale" value="cartesian"/>
<output name="pdf" file="misleading_ids.pdf"/>
<output name="summary" file="summary_7.tsv"/>
</test>
</tests>
<help><![CDATA[
**What it does**
Expand Down Expand Up @@ -145,7 +152,7 @@ scales
The ez_histograms Galaxy tool returns
- A pdf file with plots faceted on three columns
- A tsv file with statistics summary of the variables generated by the R package vtable
- A tsv file with statistics summary of the variables generated by the R package psych
]]></help>
<citations>
Expand Down
Binary file modified tools/ez_histograms/test-data/classic.pdf
Binary file not shown.
Binary file modified tools/ez_histograms/test-data/count.pdf
Binary file not shown.
Binary file modified tools/ez_histograms/test-data/headless.pdf
Binary file not shown.
Binary file modified tools/ez_histograms/test-data/large.pdf
Binary file not shown.
Binary file added tools/ez_histograms/test-data/misleading_ids.pdf
Binary file not shown.
1,001 changes: 1,001 additions & 0 deletions tools/ez_histograms/test-data/misleading_ids.tsv

Large diffs are not rendered by default.

Binary file modified tools/ez_histograms/test-data/rowheadless.pdf
Binary file not shown.
Binary file modified tools/ez_histograms/test-data/single_headed_col.pdf
Binary file not shown.
14 changes: 7 additions & 7 deletions tools/ez_histograms/test-data/summary_1.tsv
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Variable N Mean Std. Dev. Min Pctl. 25 Median Pctl. 75 Max
BaseMean 990 1091.5132 2625.4763 0 31.199603 249.39264 1054.0101 33915.214
log2FC 849 -0.38999858 1.9953097 -5.2835189 -2.0088992 -0.86544745 1.2737699 5.6079286
StdErr 849 0.30667747 0.1285766 0.089282127 0.1962894 0.28928034 0.39630761 0.56482292
Wald.Stats 849 -0.95820431 6.3140778 -21.603869 -5.9077253 -4.475739 5.2726083 19.585597
P.value 848 0.030010804 0.15185376 0.00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000016518389 0.0000000000036649168 0.000000012325875 0.0000022785953 0.98727107
P.adj 813 0.00004223522 0.000078050135 0.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000034761298 0.000000000255376 0.00000044934286 0.000034873651 0.00028949939
var_names var_num n mean sd Q0.25 Q0.5 Q0.75
BaseMean 1 990 1.091513e+03 2.625476e+03 3.119960e+01 2.493926e+02 1.054010e+03
log2FC 2 849 -3.899986e-01 1.995310e+00 -2.008899e+00 -8.654475e-01 1.273770e+00
StdErr 3 849 3.066775e-01 1.285766e-01 1.962894e-01 2.892803e-01 3.963076e-01
Wald.Stats 4 849 -9.582043e-01 6.314078e+00 -5.907725e+00 -4.475739e+00 5.272608e+00
P.value 5 848 3.001080e-02 1.518538e-01 3.664917e-12 1.232588e-08 2.278595e-06
P.adj 6 813 4.223522e-05 7.805013e-05 2.553760e-10 4.493429e-07 3.487365e-05
32 changes: 16 additions & 16 deletions tools/ez_histograms/test-data/summary_2.tsv
Original file line number Diff line number Diff line change
@@ -1,16 +1,16 @@
Variable N Mean Std. Dev. Min Pctl. 25 Median Pctl. 75 Max
GCB_Mg_S2 1000 672.224 1727.7831 0 69 236.5 700 38072
GCB_Mg_S7 1000 830.225 1848.5916 0 91.5 317 857.25 34261
GCB_Mg_S12 1000 685.564 1662.1423 0 71 253 686.25 33558
GCB_S1 1000 450.654 1224.5977 0 38 152 459.5 29464
GCB_S6 1000 547.793 1344.4334 0 51 194 550 28567
GCB_S11 1000 481.844 1310.3546 0 40 152.5 462.5 29614
GW_S5 1000 698.673 1534.1178 0 63.75 237.5 697.75 26119
GW_S10 1000 573.137 1477.6949 0 49.75 176.5 504 29920
GW_S15 1000 718.612 1730.7819 0 69 233.5 676 33878
NS_S4 1000 824.02 1624.4871 0 100 336 914 28284
NS_S9 1000 704.673 1266.9907 0 93.5 305 796.25 21678
NS_S14 1000 883.989 1636.9581 0 114.75 366.5 1016 25492
Starch_S3 1000 905.783 1464.8208 1 135.75 430 1099 22970
Starch_S8 1000 973.323 1680.7842 0 138.75 426.5 1092.75 24999
Starch_S13 1000 752.171 1404.0569 0 93 310.5 846.25 22556
var_names var_num n mean sd Q0.25 Q0.5 Q0.75
GCB_Mg_S2 1 1000 6.72224e+02 1.727783e+03 6.9000e+01 2.365e+02 7.00000e+02
GCB_Mg_S7 2 1000 8.30225e+02 1.848592e+03 9.1500e+01 3.170e+02 8.57250e+02
GCB_Mg_S12 3 1000 6.85564e+02 1.662142e+03 7.1000e+01 2.530e+02 6.86250e+02
GCB_S1 4 1000 4.50654e+02 1.224598e+03 3.8000e+01 1.520e+02 4.59500e+02
GCB_S6 5 1000 5.47793e+02 1.344433e+03 5.1000e+01 1.940e+02 5.50000e+02
GCB_S11 6 1000 4.81844e+02 1.310355e+03 4.0000e+01 1.525e+02 4.62500e+02
GW_S5 7 1000 6.98673e+02 1.534118e+03 6.3750e+01 2.375e+02 6.97750e+02
GW_S10 8 1000 5.73137e+02 1.477695e+03 4.9750e+01 1.765e+02 5.04000e+02
GW_S15 9 1000 7.18612e+02 1.730782e+03 6.9000e+01 2.335e+02 6.76000e+02
NS_S4 10 1000 8.24020e+02 1.624487e+03 1.0000e+02 3.360e+02 9.14000e+02
NS_S9 11 1000 7.04673e+02 1.266991e+03 9.3500e+01 3.050e+02 7.96250e+02
NS_S14 12 1000 8.83989e+02 1.636958e+03 1.1475e+02 3.665e+02 1.01600e+03
Starch_S3 13 1000 9.05783e+02 1.464821e+03 1.3575e+02 4.300e+02 1.09900e+03
Starch_S8 14 1000 9.73323e+02 1.680784e+03 1.3875e+02 4.265e+02 1.09275e+03
Starch_S13 15 1000 7.52171e+02 1.404057e+03 9.3000e+01 3.105e+02 8.46250e+02
14 changes: 7 additions & 7 deletions tools/ez_histograms/test-data/summary_3.tsv
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Variable N Mean Std. Dev. Min Pctl. 25 Median Pctl. 75 Max
V2 1000 1286.6999 2744.7392 6.5122425 100.50742 409.92217 1309.8068 33915.214
V3 1000 -0.4092237 1.9730612 -5.2835189 -2.0884579 -0.946016 1.2600548 5.6079286
V4 1000 0.30466439 0.1311528 0.089282127 0.19205135 0.28205562 0.40370597 0.56482292
V5 1000 -0.9419943 6.2085133 -21.603869 -5.7808572 -4.5659907 5.0667272 19.585597
V6 1000 0.0000018003049 0.000003260254 0.00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000016518389 0.00000000002981881 0.000000042451763 0.0000019035803 0.000013756861
V7 1000 0.000042996682 0.00007258655 0.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000034761298 0.0000000025022835 0.000001784909 0.000053393931 0.00028949939
var_names var_num n mean sd Q0.25 Q0.5 Q0.75
V2 1 1000 1.286700e+03 2.744739e+03 1.005074e+02 4.099222e+02 1.309807e+03
V3 2 1000 -4.092237e-01 1.973061e+00 -2.088458e+00 -9.460160e-01 1.260055e+00
V4 3 1000 3.046644e-01 1.311528e-01 1.920513e-01 2.820556e-01 4.037060e-01
V5 4 1000 -9.419943e-01 6.208513e+00 -5.780857e+00 -4.565991e+00 5.066727e+00
V6 5 1000 1.800305e-06 3.260254e-06 2.981881e-11 4.245176e-08 1.903580e-06
V7 6 1000 4.299668e-05 7.258655e-05 2.502284e-09 1.784909e-06 5.339393e-05
Loading

0 comments on commit 5e25392

Please sign in to comment.