-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrplot.R
456 lines (409 loc) · 17.5 KB
/
rplot.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
# A small library for rendering bar or scatterplots of csv data to console using ascii characters.
# Call `Rscript rplot.R -h` for instructions..
# by @geotheory | geotheory.co.uk 2016
# manage input arguments
args_in = commandArgs(trailingOnly=T)
# arguments that require a following value (e.g. "-p '+'")
pars = list(sep=c('-s',','), quote=c('-q',"\"'"), pch=c('-p','*'), x=c('-c',50),
y=c('-r',20), bins=c('-b',15), X=c('-X','%'), size=c('-d',NA), asp=c('-A',1))
# split up combined arguments (e.g. '-am' for aggregate by mean)
args = c(unlist(sapply(args_in, function(a) {
if(substr(a,1,1)=='-') return(paste0('-',strsplit(substr(a,2,100),'')[[1]])) else a
}), use.names = F), '--') # '--' added in case -A included without an argument
# update pars argument update received
for(i in 1:length(pars)) {
p = pars[[i]]
if(p[1] %in% args) {
n = match(p[1], args) + 1
if(substr(args[n],1,1) != '-') {
if(p[1] == '-s') {
if(is.na(args[n])) args[n] = "" # whitespace seperator
if(args[n] == "\\t") args[n] = "\t" # string to tab char
}
pars[[i]][2] = args[n]
args = args[-n]
}
}
}
plot_args = args[substr(args, 1, 1) == '-']
field_args = args[!substr(args, 1, 1) == '-']
if('-A' %in% plot_args) if(is.na(pars$asp[2])) pars$asp[2] = 1 # when no argument provided
if(any(c('-h','--help') %in% args_in | '-h' %in% plot_args)) {
cat("**********************
****** R-PLOT ******
**********************
This library plots a scatterplot or hashbar plot (bars made of hashes!) of a csv or a similarly formatted
file or string in your console. If 2 numeric id_fields are provided a scatterplot will default, else hashbars.
Required arguments: csv file/string, then column name(s)/index(ices) (values-column last for hashbars)
NB read.table check.names=T so e.g. numeric colnames prepend 'X' and those with spaces have spaces replaced by '.'.
Use '-Pz | head' or '-O' to suppress plots and investigate data read-in.
USAGE
Example csv call - scatterplot (by column name):
\"Rscript rplot.R file.csv num_field1 num_field2\"
Example csv call - scatterplot (by column index):
\"Rscript rplot.R file.csv col#1 col#3\"
Example csv call - hashbar plot:
\"Rscript rplot.R file.csv id_field1 id_field2 value_field\"
Example csv text string call:
\"Rscript rplot.R \"$(cat file.csv)\" id_field value_field\"
For convenience you can set up an alias in .bash_profile or equivalent, e.g.
\"rplot() { Rscript /pathto/rplot.R \"$1\" ${*:2}; }\"
and call with:
\"rplot file.csv field1 field2 etc..\"
OPTIONS:
Data handling:
-n Specify no header row for input data. Use col indices instead
-s sep character for input data (default `,`). Requires value e.g. \";\" \"\t\" \"\" (inc. quotes)
-q quote character for input data. Requires value e.g. \"'\" or '\"' (inc. outer quotes)
-X Non-numeric characters to remove from numbers (other than \"{space} , $ £ € %\")
-a Aggregate (default `sum`) a hashbar plot data by its categorical variables
-m Aggregate by `mean` if `-a` selected
-M Aggregate by `median` if `-a` selected
-l Aggregate by `length` (count instances) if `-a` selected
-b Histogram bins (default 15) if `-F` selected. Requires following value
Plotting:
-o Reorder hashbar chart by value (also reorders data.frames)
-H Override default scatterplot with hashbar plot
-S Override default hashbar plot with scatterplot (NA values are removed)
-F Override default scatter/hash plot with frequency histogram (requires single numeric field)
-r Scatterplot rows/height (default 20). Requires following value
-c Scatterplot cols/width (default 50). Requires following value
-d Quick plot-size tool. Requires argument: 'l'/'s' large/small
-A Fix y/x aspect ratio. Without argument defaults to 1, otherwise value given
-p pch char (defaults: `#` hashbars, `*` scatterplots without overplotting,
`. : ■ █` scatterplots with o/p). Requires 1 char eg. `-p \".\"` (eg. with -y)
or a 4 char string eg. \".°*@\" to change overplot symbols (inc. quotes)
-R Add r2 correlation (bivariate only))
-w Remove ID col max width constraints in hash plots, and scale to full console width
-x Suppress summary in case of scatterplot
-y Suppress scatterplot point symbols (that show overplotting)
-z Suppress plot (eg. use with -P or -Q)
Other:
-h Call this help (also --help)
-O Inspect data.frame before and after numeric parsing and quit
-P Output raw data.frame to console and quit (truncated 1000 rows)
-Q Output processed data.frame to console and quit (truncated 1000 rows)
")
quit()
}
# 2 functions from {scales}: included as R's library path sometimes isn't accessible from a console call
zero_range = function (x, tol = 1000 * .Machine$double.eps) {
if (length(x) == 1) return(TRUE)
if (length(x) != 2) stop("x must be length 1 or 2")
if (any(is.na(x))) return(NA)
if (x[1] == x[2]) return(TRUE)
if (all(is.infinite(x))) return(FALSE)
m <- min(abs(x))
if (m == 0) return(FALSE)
abs((x[1] - x[2])/m) < tol
}
rescale = function (x, to=c(0,1), from, finite=T) {
if(missing(from)) from = range(x, na.rm=T)
if(zero_range(from) || zero_range(to)) return(ifelse(is.na(x), NA, mean(to)))
(x - from[1])/diff(from) * diff(to) + to[1]
}
map = function(x, n) floor(rescale(x, to=c(1,n)))
# coerce set to numeric if possible, else return FALSE
num = function(n){
if(class(n) %in% c('integer','numeric','double')) return(n)
n = gsub(paste(c(' ',',','$','£','€','%',pars$X[2]), collapse='|'), '', n)
is.num = all(!is.na(suppressWarnings(as.numeric(na.omit(n)))))
if(is.num) return(as.numeric(n))
return(F)
}
# because formatC can't quite cut it
format_num = function(x) {
if(length(unique(nchar(x)))==1 & sum(x%%1) == 0) return(x) # year
f1 = abs(x) >= 10000000
f2 = abs(x) >= 100 & abs(x) < 10000000
f3 = abs(x) >= 1 & abs(x) < 100
f4 = abs(x) >= 0.001 & abs(x) < 1
f5 = x == 0
f6 = !f1 & !f2 & !f3 & !f4 & !f5
out = x
out[f1] = formatC(x[f1], digits=2, format = "e")
out[f2] = formatC(round(x[f2], 0), digits=1, big.mark=',', format = "f", drop0trailing=T)
out[f3] = formatC(round(x[f3], 2), digits=2, big.mark=',', format = "f", drop0trailing=T)
out[f4] = formatC(x[f4], digits=3, format = "f", drop0trailing=T)
out[f5] = '0'
out[f6] = formatC(x[f6], digits=2, format = "e")
out
}
scatter_plot = function(x, y, cols=50, rows=20, pch="*", xlab="x", ylab="Y") {
y0 = y
if('-o' %in% plot_args) y = sort(as.numeric(y))
if('-A' %in% plot_args){
data_asp = diff(range(y)) / diff(range(x))
rows = ceiling(cols * data_asp * as.numeric(pars$asp[2]) * 2/5)
}
if(xlab == ylab) xlab = "Index"
if(missing(x)) x <- 1:length(y)
else x <- as.numeric(x)
symbs = c('.', ':', '■', '█')
if(nchar(pch)==4) symbs = strsplit(pch, '')[[1]]
if(nchar(pch)!=1 & nchar(pch)!=4) stop("pch must be 1 or 4 characters long")
# output processed data.frame to console
orig_dat = data.frame(x, y, stringsAsFactors=F)
names(orig_dat) = c(xlab, ylab)
if('-Q' %in% plot_args) {
print(head(orig_dat,1000))
quit()
}
if('-z' %in% plot_args) quit()
# rescale to grid and count point overplotting
if('-d' %in% plot_args){
if(pars$size[2] == 'l'){
rows = 40; cols = 100;
} else if(pars$size[2] == 's'){
rows = 10; cols = 25;
} else warning('-d parameter requires either \'l\' or \'s\' input')
}
summary = as.data.frame(table(paste(map(x,cols), map(-y,rows))), stringsAsFactors=F) # summarise
summary = data.frame(apply(cbind(do.call('rbind', strsplit(summary[[1]], split=' ')), summary$Freq),2,as.numeric)) # parse
names(summary) = c('x','y','freq')
op = max(summary$freq)
pr_labs = F; labs = NULL
if(op > 1 & !'-y' %in% plot_args) { # overplotting and not manually over-riden
if(nchar(pch)==1 & '-p' %in% plot_args) warning('Single character argument for -p is ignored when point overplotting is present except when -y selected.')
# cluster overplots to map to symbols
summary$grp = summary$freq
if(length(unique(summary$freq)) > 4) {
f = summary$grp != 1 # ignore 1s (over-plotting)
centres = unique(as.numeric(quantile(summary$grp[f], 0:2/2)))
summary$grp[f] = kmeans(summary$grp[f], centres)$cluster + 1
} else{
summary$grp = as.numeric(factor(summary$freq))
}
# symbol labels
freqs = sort(unique(summary$freq))
if(!identical(freqs, c(1,2)) & !identical(freqs, 1)) { # ie. not points representable literally by comb of '.' and ':'
pr_labs = T
# data break points
op_data = unique(summary[ order(summary$freq), 3:4 ])
op_data_rev = op_data[order(op_data$freq, decreasing=T), ]
n = length(unique(op_data$grp))
labs = data.frame(p0 = 1:n, p1 = 1:n, lab = '', x=0, stringsAsFactors=F)
for(g in unique(op_data$grp)) {
labs$p0[g] = op_data$freq[match(g, op_data$grp)] # first match in group
labs$p1[g] = op_data_rev$freq[match(g, op_data_rev$grp)] # last match in group
labs$x[g] = mean(summary$freq[summary$grp == g]) # mean of grp frequencies
}
for(i in 1:nrow(labs)) {
labs$lab[i] = ifelse(labs$p0[i] == labs$p1[i], labs$p0[i], paste0(labs$p0[i], '-', labs$p1[i]))
lab_mean = ifelse(length(grep('-', labs$lab[i]))>0, paste0(' (x̄ ', format(round(labs$x[i],1),nsmall=1), ')'), '')
labs$lab[i] = paste0(symbs[i], ' ', labs$lab[i], lab_mean)
}
labs = c('Points', labs$lab)
}
} else { # only 2 types of point
symbs = pch
summary$grp = 1
}
# output scatterplot
l = rep(' ', cols)
cat(' ', rep('_',cols), ' \n', sep='')
k = 0
for(i in 1:rows) {
dat = subset(summary, y == i)
ln = l
ln[dat$x] = symbs[dat$grp]
cat('|', ln, '| ', sep='')
if(pr_labs & i <= length(labs)) cat(' ', labs[i], sep='') # point symbol key
k = k + 1
if(k == ceiling(rows/2)) cat('', ylab) # y label
if(k == rows) if('-R' %in% plot_args & xlab!='Index') cat(' R²=', round(cor(x,y0),3), sep='')
cat('\n')
}
cat('|', rep('_',cols), '|\n', sep='')
xlab_mar = max(0, 1 + cols/2 - (nchar(xlab)/2))
cat(rep(' ', xlab_mar), xlab, '\n', sep="") # x label
# add summary unless overriden
if(!'-x' %in% plot_args) {
cat(nrows, 'data rows plotted')
if(nrow(d_orig) > nrows) cat('.', nrow(d_orig) - nrows, 'rows with NA values omitted')
cat('\n')
print(summary(orig_dat))
}
}
# report d.f. column classes
inspect_df = function (obj) {
cat('d.f. dimensions: ', dim(obj), '\n')
try({
r = NULL
for (i in 1:ncol(obj)) {
r = c(r, class(obj[[i]]))
}
names(r) = names(obj)
print(r)
}, silent = FALSE)
print(head(obj, 3))
}
# read in data
cons_width = as.integer(system('tput cols', intern=T)) - 10
cons_width = ifelse("-w" %in% plot_args, cons_width, min(100, cons_width))
txt = field_args[1]
rows = length(strsplit(txt, split='\n')[[1]])
# data from text blob argument or csv file
if('-n' %in% plot_args) header = F else header = T
if(rows == 1) d = read.table(txt, sep=pars$sep[2], stringsAsFactors=F, header=header, row.names=NULL, quote=pars$quote[2])
if(rows > 1) d = read.table(text=txt, header=header, sep=pars$sep[2], stringsAsFactors=F, quote=pars$quote[2])
if("-O" %in% plot_args){
cat('\nRaw data as read-in:\n')
inspect_df(d)
}
# parse numerics
for(i in 1:ncol(d)){
nums = num( d[,i] ) # check if numeric/coercible
if(is.numeric(nums[1])) d[,i] = nums
}
d_orig = d # backup as is
if("-O" %in% plot_args){
cat('\nNumerically parsed data:\n')
inspect_df(d)
cat('\n')
quit()
}
# output data.frame to console
if('-P' %in% plot_args){
print(head(d, 1000))
quit()
}
field_names = field_args[2:(length(field_args))]
# interpret field names - check if valid as name or column index
for(i in length(field_names):1) {
f = field_names[i]
badfield = F
match = pmatch(f, names(d))
if(is.na(match)){ # not a valid col name
if(is.numeric(num(f))){ # is possible number
f = as.numeric(f)
if(f <= ncol(d)){ # is within col index range
field_names[i] = names(d)[f] # change to col name
} else badfield = T
} else badfield = T
} else field_names[i] = names(d)[match] # in case partial match
if(badfield) {
warning(paste('fieldname not valid name or column index:', f))
quit()
field_names = field_names[-i]
}
}
id_fields = field_names[1:(length(field_names)-1)]
values_field = field_names[length(field_names)]
# omit rows with NA in plotting columns
d = na.omit(d[,c(id_fields, values_field), drop=F])
nrows = nrow(d) # to calc NA removals
# scatterplot if 2 fully numeric/NA variables or manually specified
if(!'-F' %in% plot_args & length(id_fields) == 1) {
v = d[[id_fields]]
numvals = suppressWarnings(as.numeric( v[!is.na(v)] ))
all_numeric = all(!is.na(numvals))
plot_scatter = F
if(all_numeric & !'-H' %in% plot_args) plot_scatter = T
if(!all_numeric & '-S' %in% plot_args) plot_scatter = T
if(plot_scatter) {
if(values_field == id_fields) { # ie. only a single field supplied
dat = na.omit(data.frame(x = 1:length(v), y = d[[values_field]], stringsAsFactors=F))
} else dat = na.omit(data.frame(x = suppressWarnings(as.numeric(v)), y = d[[values_field]], stringsAsFactors=F))
scatter_plot(dat$x, dat$y, cols=as.numeric(pars$x[2]), rows=as.numeric(pars$y[2]), pch=pars$pch[2], xlab=id_fields, ylab=values_field)
quit()
}
}
# aggregate
if('-a' %in% plot_args) {
fun = 'sum'
if('-m' %in% plot_args) fun = 'mean'
if('-M' %in% plot_args) fun = 'median'
if('-l' %in% plot_args) fun = 'length'
cat('Aggregate function is', fun, '\n')
if(length(id_fields) > 1) agg_list = as.list(d[,id_fields]) else agg_list = list(d[,id_fields])
if(fun == 'length') {
d = aggregate(rep(1,nrow(d)), by=agg_list, FUN=sum, na.rm=T, simplify=T)
} else d = aggregate(d[[values_field]], by=agg_list, FUN=fun, na.rm=T, simplify=T)
if(length(unique(c(id_fields, values_field))) == 1) {
values_field = fun # ie. 'length'
d[[id_fields]] = 1:nrow(d)
}
}
# rename fields if they've changed
names(d) = c(id_fields, values_field)
# reorder data hashbars
if('-o' %in% plot_args) d = d[order(d[[values_field]], decreasing=T),]
# output processed data.frame to console
if('-Q' %in% plot_args){
print(head(d,1000))
quit()
}
# histogram for single numeric variable
if('-F' %in% plot_args){
ran = range(d[[values_field]])
brks = seq(ran[1], ran[2], length.out = as.numeric(pars$bins[2]))
cuts = cut(d[[values_field]], brks, include.lowest=T)
cats = as.numeric(cuts)
labs = levels(cuts)
grps = as.data.frame(table(cats), stringsAsFactors=F)
grps$cats = as.numeric(grps$cats)
fullset = min(grps$cats):max(grps$cats)
missing = fullset[!fullset %in% grps$cats]
for(m in missing) grps = rbind(grps, data.frame(cats=m, Freq=0))
grps = grps[order(grps$cats),]
for(i in c('\\[', '\\]', '\\(')) labs = gsub(i, '', labs)
grps$means = as.numeric(lapply(sapply(labs, strsplit, split=','), function(i) mean(as.numeric(i))))
pretty_labs = pretty(grps$means, 4)
pretty_labs = pretty_labs[pretty_labs >= ran[1] & pretty_labs <= ran[2]]
ids = sapply(pretty_labs, function(x){ which(abs(grps$means-x)==min(abs(grps$means-x)))[1]} ) # closest group to assign label
grps$lab = ''
grps$lab[ids] = paste('--', pretty_labs, '--')
d = grps[,c(4,2)]
id_fields = values_field; values_field = 'frequency'
names(d) = c(id_fields, values_field)
}
# calculate column widths
field_data = list()
pos_x = 1
for(f in c(id_fields, values_field)) {
n = length(field_data) + 1
vals = d[[f]]
numerics = !is.na(suppressWarnings(as.numeric(vals)))
vals[numerics] = format_num(as.numeric(vals[numerics]))
maxlen = max(nchar(f), nchar(vals))
maxlen = ifelse("-w" %in% plot_args, maxlen, min(maxlen, 30))
vals = substr(vals, 1, maxlen)
padstr = paste0("%-", maxlen, "s")
field_data[n] = list(list(name = sprintf(padstr, substr(f, 1, maxlen)), values = sprintf(padstr, vals),
pos_start = pos_x, pos_end = pos_x + maxlen + 2))
pos_x = pos_x + maxlen + 3
char_deficit = maxlen - nchar(field_data[[n]]$values) # fix for sprintf bug that ignores special characters when padding
field_data[[n]]$values = paste0(field_data[[n]]$values, sapply(char_deficit, function(i) paste(rep(' ',i), collapse='')))
}
plot_width = cons_width - field_data[[length(field_data)]]$pos_end
values = d[[values_field]]
# scale and spacing
plot_ind = field_data[[ length(field_data) ]]$pos_end + 2
# whether scale to zero or positive/negative extreme
if(min(values) < 0 & max(values) > 0) {
ran = range(values) # scale positive to negative
} else if(min(values) >= 0) {
ran = c(0, max(values)) # all positive, scale to zero
} else{
ran = c(min(values), 0) # still plot hashbars from left axis
}
fact = (cons_width - plot_ind) / diff(ran)
#plot_values = as.integer(rescale(values, to = fact * ran)) # old
plot_values = as.integer(values * fact)
minvalue = min(values)
spaces = pmax(0, pmin(plot_values - fact * minvalue, fact * -minvalue))
hashes = fact * abs(values)
# print hashbar plot
cat(nrows, 'data rows plotted')
if('-z' %in% plot_args) quit()
if('-p' %in% plot_args) pch = pars$pch[2] else pch = '#'
if(nrow(d_orig) > nrows) cat('.', nrow(d_orig) - nrows, 'rows with NA values omitted')
cat('\n'); for(f in field_data) cat(f$name, ' '); cat('\n')
for(i in 1:length(values)) {
# id_fields
for(f in field_data) cat(f$values[i], ' ')
# hashes
cat(rep(' ', spaces[i]), sep='')
cat(rep(pch, hashes[i]), '\n', sep='')
}