Skip to content

Commit

Permalink
Infer types. Smart defaults for the visualize window. Basic implement…
Browse files Browse the repository at this point in the history
…ation. (#1134)

* Implement smart suggestions for the visualize flow.

* Address JS comments.

* Implement caravel dataframe wrapper.
  • Loading branch information
bkyryliuk authored Sep 23, 2016
1 parent fc921d6 commit df89bec
Show file tree
Hide file tree
Showing 5 changed files with 218 additions and 48 deletions.
2 changes: 1 addition & 1 deletion caravel/assets/javascripts/SqlLab/components/ResultSet.jsx
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ class ResultSet extends React.Component {
<div className="ResultSet">
<Table
data={results.data}
columns={results.columns}
columns={results.columns.map((col) => col.name)}
sortable
className="table table-condensed table-bordered"
filterBy={this.state.searchText}
Expand Down
45 changes: 30 additions & 15 deletions caravel/assets/javascripts/SqlLab/components/VisualizeModal.jsx
Original file line number Diff line number Diff line change
Expand Up @@ -26,10 +26,25 @@ class VisualizeModal extends React.Component {
columns: {},
hints: [],
};
// update columns if possible
this.setStateFromProps();
}
componentWillMount() {
this.setStateFromProps();
}
componentDidMount() {
this.validate();
}
setStateFromProps() {
if (!this.props.query || !this.props.query.results.columns) {
return;
}
const columns = {};
this.props.query.results.columns.forEach((col) => {
columns[col.name] = col;
});
this.setState({ columns });
}
validate() {
const hints = [];
const cols = this.mergedColumns();
Expand Down Expand Up @@ -67,8 +82,8 @@ class VisualizeModal extends React.Component {
const columns = Object.assign({}, this.state.columns);
if (this.props.query && this.props.query.results.columns) {
this.props.query.results.columns.forEach((col) => {
if (columns[col] === undefined) {
columns[col] = {};
if (columns[col.name] === undefined) {
columns[col.name] = col;
}
});
}
Expand All @@ -88,39 +103,39 @@ class VisualizeModal extends React.Component {
this.setState({ datasourceName: event.target.value });
this.validate();
}
changeCheckbox(attr, col, event) {
changeCheckbox(attr, columnName, event) {
let columns = this.mergedColumns();
const column = Object.assign({}, columns[col], { [attr]: event.target.checked });
columns = Object.assign({}, columns, { [col]: column });
const column = Object.assign({}, columns[columnName], { [attr]: event.target.checked });
columns = Object.assign({}, columns, { [columnName]: column });
this.setState({ columns }, this.validate);
}
changeAggFunction(col, option) {
changeAggFunction(columnName, option) {
let columns = this.mergedColumns();
const val = (option) ? option.value : null;
const column = Object.assign({}, columns[col], { agg: val });
columns = Object.assign({}, columns, { [col]: column });
const column = Object.assign({}, columns[columnName], { agg: val });
columns = Object.assign({}, columns, { [columnName]: column });
this.setState({ columns }, this.validate);
}
render() {
if (!(this.props.query)) {
return <div />;
}
const tableData = this.props.query.results.columns.map((col) => ({
column: col,
column: col.name,
is_dimension: (
<input
type="checkbox"
onChange={this.changeCheckbox.bind(this, 'is_dim', col)}
checked={(this.state.columns[col]) ? this.state.columns[col].is_dim : false}
onChange={this.changeCheckbox.bind(this, 'is_dim', col.name)}
checked={(this.state.columns[col.name]) ? this.state.columns[col.name].is_dim : false}
className="form-control"
/>
),
is_date: (
<input
type="checkbox"
className="form-control"
onChange={this.changeCheckbox.bind(this, 'is_date', col)}
checked={(this.state.columns[col]) ? this.state.columns[col].is_date : false}
onChange={this.changeCheckbox.bind(this, 'is_date', col.name)}
checked={(this.state.columns[col.name]) ? this.state.columns[col.name].is_date : false}
/>
),
agg_func: (
Expand All @@ -132,8 +147,8 @@ class VisualizeModal extends React.Component {
{ value: 'avg', label: 'AVG(x)' },
{ value: 'count_distinct', label: 'COUNT(DISTINCT x)' },
]}
onChange={this.changeAggFunction.bind(this, col)}
value={(this.state.columns[col]) ? this.state.columns[col].agg : null}
onChange={this.changeAggFunction.bind(this, col.name)}
value={(this.state.columns[col.name]) ? this.state.columns[col.name].agg : null}
/>
),
}));
Expand Down
113 changes: 113 additions & 0 deletions caravel/dataframe.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
""" Caravel wrapper around pandas.DataFrame.
TODO(bkyryliuk): add support for the conventions like: *_dim or dim_*
dimensions, *_ts, ts_*, ds_*, *_ds - datetime, etc.
TODO(bkyryliuk): recognize integer encoded enums.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import pandas as pd
import numpy as np


INFER_COL_TYPES_THRESHOLD = 95
INFER_COL_TYPES_SAMPLE_SIZE = 100


# http://pandas.pydata.org/pandas-docs/stable/internals.html#
# subclassing-pandas-data-structures
class CaravelDataFrame(object):
def __init__(self, df):
self.__df = df.where((pd.notnull(df)), None)

@property
def size(self):
return len(self.__df.index)

@property
def data(self):
return self.__df.to_dict(orient='records')

@property
def columns_dict(self):
"""Provides metadata about columns for data visualization.
:return: dict, with the fields name, type, is_date, is_dim and agg.
"""
if self.__df.empty:
return None

columns = []
sample_size = min(INFER_COL_TYPES_SAMPLE_SIZE, len(self.__df.index))
sample = self.__df
if sample_size:
sample = self.__df.sample(sample_size)
for col in self.__df.dtypes.keys():
column = {
'name': col,
'type': self.__df.dtypes[col].name,
'is_date': is_date(self.__df.dtypes[col]),
'is_dim': is_dimension(self.__df.dtypes[col], col),
}
agg = agg_func(self.__df.dtypes[col], col)
if agg_func:
column['agg'] = agg

if column['type'] == 'object':
# check if encoded datetime
if (datetime_conversion_rate(sample[col]) >
INFER_COL_TYPES_THRESHOLD):
column.update({
'type': 'datetime_string',
'is_date': True,
'is_dim': False,
'agg': None
})
# 'agg' is optional attribute
if not column['agg']:
column.pop('agg', None)
columns.append(column)

return columns


# It will give false positives on the numbers that are stored as strings.
# It is hard to distinguish integer numbers and timestamps
def datetime_conversion_rate(data_series):
success = 0
total = 0
for value in data_series:
total = total + 1
try:
pd.to_datetime(value)
success = success + 1
except Exception:
continue
return 100 * success / total


def is_date(dtype):
return dtype.name.startswith('datetime')


def is_dimension(dtype, column_name):
if is_id(column_name):
return False
return dtype == np.object or dtype == np.bool


def is_id(column_name):
return column_name.startswith('id') or column_name.endswith('id')


def agg_func(dtype, column_name):
# consider checking for key substring too.
if is_id(column_name):
return 'count_distinct'
if np.issubdtype(dtype, np.number):
return 'sum'
return None
49 changes: 22 additions & 27 deletions caravel/sql_lab.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,10 @@
from datetime import datetime
import pandas as pd
import logging
import numpy
import time
import json

from caravel import app, cache, db, models, utils
from caravel import app, db, models, utils, dataframe

QueryStatus = models.QueryStatus

Expand Down Expand Up @@ -114,45 +114,40 @@ def handle_error(msg):
time.sleep(1)
polled = cursor.poll()

columns = None
data = None
cdf = None
if result_proxy.cursor:
columns = [col[0] for col in result_proxy.cursor.description]
column_names = [col[0] for col in result_proxy.cursor.description]
data = result_proxy.fetchall()
df = pd.DataFrame(data, columns=columns)
df = df.where((pd.notnull(df)), None)
# TODO consider generating tuples instead of dicts to send
# less data through the wire. The command bellow does that,
# but we'd need to align on the client side.
# data = df.values.tolist()
data = df.to_dict(orient='records')
cdf = dataframe.CaravelDataFrame(
pd.DataFrame(data, columns=column_names))
# TODO consider generating tuples instead of dicts to send
# less data through the wire. The command bellow does that,
# but we'd need to align on the client side.
# data = df.values.tolist()

query.rows = result_proxy.rowcount
query.progress = 100
query.status = QueryStatus.SUCCESS
if query.rows == -1 and data:
if query.rows == -1 and cdf:
# Presto doesn't provide result_proxy.row_count
query.rows = len(data)

# CTAs queries result in 1 cell having the # of the added rows.
query.rows = cdf.size
if query.select_as_cta:
query.select_sql = '{}'.format(database.select_star(
query.tmp_table_name, limit=query.limit))

query.end_time = utils.now_as_float()
session.commit()

payload = {
'query_id': query.id,
'status': query.status,
'data': [],
}
if query.status == models.QueryStatus.SUCCESS:
payload['data'] = data
payload['columns'] = columns
else:
payload['error'] = query.error_message
if return_results:
payload = {
'query_id': query.id,
'status': query.status,
'data': [],
}
if query.status == models.QueryStatus.SUCCESS:
payload['data'] = cdf.data if cdf else []
payload['columns'] = cdf.columns_dict if cdf else []
else:
payload['error'] = query.error_message
return payload
'''
# Hack testing using a kv store for results
Expand Down
57 changes: 52 additions & 5 deletions tests/celery_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
import pandas as pd

import caravel
from caravel import app, appbuilder, db, models, sql_lab, utils
from caravel import app, appbuilder, db, models, sql_lab, utils, dataframe

from .base_tests import CaravelTestCase

Expand All @@ -34,6 +34,7 @@ class CeleryConfig(object):


class UtilityFunctionTests(CaravelTestCase):

# TODO(bkyryliuk): support more cases in CTA function.
def test_create_table_as(self):
select_query = "SELECT * FROM outer_space;"
Expand Down Expand Up @@ -193,8 +194,8 @@ def test_run_sync_query(self):
result2 = self.run_sql(
1, sql_where, tmp_table='tmp_table_2', cta='true')
self.assertEqual(QueryStatus.SUCCESS, result2['query']['state'])
self.assertIsNone(result2['data'])
self.assertIsNone(result2['columns'])
self.assertEqual([], result2['data'])
self.assertEqual([], result2['columns'])
query2 = self.get_query_by_id(result2['query']['serverId'])

# Check the data in the tmp table.
Expand All @@ -208,8 +209,8 @@ def test_run_sync_query(self):
result3 = self.run_sql(
1, sql_empty_result, tmp_table='tmp_table_3', cta='true',)
self.assertEqual(QueryStatus.SUCCESS, result3['query']['state'])
self.assertIsNone(result3['data'])
self.assertIsNone(result3['columns'])
self.assertEqual([], result3['data'])
self.assertEqual([], result3['columns'])

query3 = self.get_query_by_id(result3['query']['serverId'])
self.assertEqual(QueryStatus.SUCCESS, query3.status)
Expand Down Expand Up @@ -250,6 +251,52 @@ def test_run_async_query(self):
self.assertEqual(True, query1.select_as_cta)
self.assertEqual(True, query1.select_as_cta_used)

def test_get_columns_dict(self):
main_db = db.session.query(models.Database).filter_by(
database_name='main').first()
df = main_db.get_df("SELECT * FROM multiformat_time_series", None)
cdf = dataframe.CaravelDataFrame(df)
if main_db.sqlalchemy_uri.startswith('sqlite'):
self.assertEqual(
[{'is_date': True, 'type': 'datetime_string', 'name': 'ds',
'is_dim': False},
{'is_date': True, 'type': 'datetime_string', 'name': 'ds2',
'is_dim': False},
{'agg': 'sum', 'is_date': False, 'type': 'int64',
'name': 'epoch_ms', 'is_dim': False},
{'agg': 'sum', 'is_date': False, 'type': 'int64',
'name': 'epoch_s', 'is_dim': False},
{'is_date': True, 'type': 'datetime_string', 'name': 'string0',
'is_dim': False},
{'is_date': False, 'type': 'object',
'name': 'string1', 'is_dim': True},
{'is_date': True, 'type': 'datetime_string', 'name': 'string2',
'is_dim': False},
{'is_date': False, 'type': 'object',
'name': 'string3', 'is_dim': True}]
, cdf.columns_dict
)
else:
self.assertEqual(
[{'is_date': True, 'type': 'datetime_string', 'name': 'ds',
'is_dim': False},
{'is_date': True, 'type': 'datetime64[ns]',
'name': 'ds2', 'is_dim': False},
{'agg': 'sum', 'is_date': False, 'type': 'int64',
'name': 'epoch_ms', 'is_dim': False},
{'agg': 'sum', 'is_date': False, 'type': 'int64',
'name': 'epoch_s', 'is_dim': False},
{'is_date': True, 'type': 'datetime_string', 'name': 'string0',
'is_dim': False},
{'is_date': False, 'type': 'object',
'name': 'string1', 'is_dim': True},
{'is_date': True, 'type': 'datetime_string', 'name': 'string2',
'is_dim': False},
{'is_date': False, 'type': 'object',
'name': 'string3', 'is_dim': True}]
, cdf.columns_dict
)


if __name__ == '__main__':
unittest.main()

0 comments on commit df89bec

Please sign in to comment.