From df89bec712cead1d2e3d3e4fcb300a889524a385 Mon Sep 17 00:00:00 2001 From: Bogdan Date: Fri, 23 Sep 2016 11:14:38 -0700 Subject: [PATCH] Infer types. Smart defaults for the visualize window. Basic implementation. (#1134) * Implement smart suggestions for the visualize flow. * Address JS comments. * Implement caravel dataframe wrapper. --- .../SqlLab/components/ResultSet.jsx | 2 +- .../SqlLab/components/VisualizeModal.jsx | 45 ++++--- caravel/dataframe.py | 113 ++++++++++++++++++ caravel/sql_lab.py | 49 ++++---- tests/celery_tests.py | 57 ++++++++- 5 files changed, 218 insertions(+), 48 deletions(-) create mode 100644 caravel/dataframe.py diff --git a/caravel/assets/javascripts/SqlLab/components/ResultSet.jsx b/caravel/assets/javascripts/SqlLab/components/ResultSet.jsx index 7f855e9c3ea47..247759e0894f6 100644 --- a/caravel/assets/javascripts/SqlLab/components/ResultSet.jsx +++ b/caravel/assets/javascripts/SqlLab/components/ResultSet.jsx @@ -72,7 +72,7 @@ class ResultSet extends React.Component {
col.name)} sortable className="table table-condensed table-bordered" filterBy={this.state.searchText} diff --git a/caravel/assets/javascripts/SqlLab/components/VisualizeModal.jsx b/caravel/assets/javascripts/SqlLab/components/VisualizeModal.jsx index 718a9fda232df..ee8d2cd3e9bf2 100644 --- a/caravel/assets/javascripts/SqlLab/components/VisualizeModal.jsx +++ b/caravel/assets/javascripts/SqlLab/components/VisualizeModal.jsx @@ -26,10 +26,25 @@ class VisualizeModal extends React.Component { columns: {}, hints: [], }; + // update columns if possible + this.setStateFromProps(); + } + componentWillMount() { + this.setStateFromProps(); } componentDidMount() { this.validate(); } + setStateFromProps() { + if (!this.props.query || !this.props.query.results.columns) { + return; + } + const columns = {}; + this.props.query.results.columns.forEach((col) => { + columns[col.name] = col; + }); + this.setState({ columns }); + } validate() { const hints = []; const cols = this.mergedColumns(); @@ -67,8 +82,8 @@ class VisualizeModal extends React.Component { const columns = Object.assign({}, this.state.columns); if (this.props.query && this.props.query.results.columns) { this.props.query.results.columns.forEach((col) => { - if (columns[col] === undefined) { - columns[col] = {}; + if (columns[col.name] === undefined) { + columns[col.name] = col; } }); } @@ -88,17 +103,17 @@ class VisualizeModal extends React.Component { this.setState({ datasourceName: event.target.value }); this.validate(); } - changeCheckbox(attr, col, event) { + changeCheckbox(attr, columnName, event) { let columns = this.mergedColumns(); - const column = Object.assign({}, columns[col], { [attr]: event.target.checked }); - columns = Object.assign({}, columns, { [col]: column }); + const column = Object.assign({}, columns[columnName], { [attr]: event.target.checked }); + columns = Object.assign({}, columns, { [columnName]: column }); this.setState({ columns }, this.validate); } - changeAggFunction(col, option) { + 
changeAggFunction(columnName, option) { let columns = this.mergedColumns(); const val = (option) ? option.value : null; - const column = Object.assign({}, columns[col], { agg: val }); - columns = Object.assign({}, columns, { [col]: column }); + const column = Object.assign({}, columns[columnName], { agg: val }); + columns = Object.assign({}, columns, { [columnName]: column }); this.setState({ columns }, this.validate); } render() { @@ -106,12 +121,12 @@ class VisualizeModal extends React.Component { return
; } const tableData = this.props.query.results.columns.map((col) => ({ - column: col, + column: col.name, is_dimension: ( ), @@ -119,8 +134,8 @@ class VisualizeModal extends React.Component { ), agg_func: ( @@ -132,8 +147,8 @@ class VisualizeModal extends React.Component { { value: 'avg', label: 'AVG(x)' }, { value: 'count_distinct', label: 'COUNT(DISTINCT x)' }, ]} - onChange={this.changeAggFunction.bind(this, col)} - value={(this.state.columns[col]) ? this.state.columns[col].agg : null} + onChange={this.changeAggFunction.bind(this, col.name)} + value={(this.state.columns[col.name]) ? this.state.columns[col.name].agg : null} /> ), })); diff --git a/caravel/dataframe.py b/caravel/dataframe.py new file mode 100644 index 0000000000000..4b86e80d45c2e --- /dev/null +++ b/caravel/dataframe.py @@ -0,0 +1,113 @@ +""" Caravel wrapper around pandas.DataFrame. + +TODO(bkyryliuk): add support for the conventions like: *_dim or dim_* + dimensions, *_ts, ts_*, ds_*, *_ds - datetime, etc. +TODO(bkyryliuk): recognize integer encoded enums. + +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import pandas as pd +import numpy as np + + +INFER_COL_TYPES_THRESHOLD = 95 +INFER_COL_TYPES_SAMPLE_SIZE = 100 + + +# http://pandas.pydata.org/pandas-docs/stable/internals.html# +# subclassing-pandas-data-structures +class CaravelDataFrame(object): + def __init__(self, df): + self.__df = df.where((pd.notnull(df)), None) + + @property + def size(self): + return len(self.__df.index) + + @property + def data(self): + return self.__df.to_dict(orient='records') + + @property + def columns_dict(self): + """Provides metadata about columns for data visualization. + + :return: dict, with the fields name, type, is_date, is_dim and agg. 
+ """ + if self.__df.empty: + return None + + columns = [] + sample_size = min(INFER_COL_TYPES_SAMPLE_SIZE, len(self.__df.index)) + sample = self.__df + if sample_size: + sample = self.__df.sample(sample_size) + for col in self.__df.dtypes.keys(): + column = { + 'name': col, + 'type': self.__df.dtypes[col].name, + 'is_date': is_date(self.__df.dtypes[col]), + 'is_dim': is_dimension(self.__df.dtypes[col], col), + } + agg = agg_func(self.__df.dtypes[col], col) + if agg_func: + column['agg'] = agg + + if column['type'] == 'object': + # check if encoded datetime + if (datetime_conversion_rate(sample[col]) > + INFER_COL_TYPES_THRESHOLD): + column.update({ + 'type': 'datetime_string', + 'is_date': True, + 'is_dim': False, + 'agg': None + }) + # 'agg' is optional attribute + if not column['agg']: + column.pop('agg', None) + columns.append(column) + + return columns + + +# It will give false positives on the numbers that are stored as strings. +# It is hard to distinguish integer numbers and timestamps +def datetime_conversion_rate(data_series): + success = 0 + total = 0 + for value in data_series: + total = total + 1 + try: + pd.to_datetime(value) + success = success + 1 + except Exception: + continue + return 100 * success / total + + +def is_date(dtype): + return dtype.name.startswith('datetime') + + +def is_dimension(dtype, column_name): + if is_id(column_name): + return False + return dtype == np.object or dtype == np.bool + + +def is_id(column_name): + return column_name.startswith('id') or column_name.endswith('id') + + +def agg_func(dtype, column_name): + # consider checking for key substring too. 
+ if is_id(column_name): + return 'count_distinct' + if np.issubdtype(dtype, np.number): + return 'sum' + return None diff --git a/caravel/sql_lab.py b/caravel/sql_lab.py index 13009f5634978..97fa11ac20203 100644 --- a/caravel/sql_lab.py +++ b/caravel/sql_lab.py @@ -2,10 +2,10 @@ from datetime import datetime import pandas as pd import logging +import numpy import time -import json -from caravel import app, cache, db, models, utils +from caravel import app, db, models, utils, dataframe QueryStatus = models.QueryStatus @@ -114,45 +114,40 @@ def handle_error(msg): time.sleep(1) polled = cursor.poll() - columns = None - data = None + cdf = None if result_proxy.cursor: - columns = [col[0] for col in result_proxy.cursor.description] + column_names = [col[0] for col in result_proxy.cursor.description] data = result_proxy.fetchall() - df = pd.DataFrame(data, columns=columns) - df = df.where((pd.notnull(df)), None) - # TODO consider generating tuples instead of dicts to send - # less data through the wire. The command bellow does that, - # but we'd need to align on the client side. - # data = df.values.tolist() - data = df.to_dict(orient='records') + cdf = dataframe.CaravelDataFrame( + pd.DataFrame(data, columns=column_names)) + # TODO consider generating tuples instead of dicts to send + # less data through the wire. The command below does that, + # but we'd need to align on the client side. + # data = df.values.tolist() query.rows = result_proxy.rowcount query.progress = 100 query.status = QueryStatus.SUCCESS - if query.rows == -1 and data: + if query.rows == -1 and cdf: # Presto doesn't provide result_proxy.row_count - query.rows = len(data) - - # CTAs queries result in 1 cell having the # of the added rows. 
+ query.rows = cdf.size if query.select_as_cta: query.select_sql = '{}'.format(database.select_star( query.tmp_table_name, limit=query.limit)) - query.end_time = utils.now_as_float() session.commit() - payload = { - 'query_id': query.id, - 'status': query.status, - 'data': [], - } - if query.status == models.QueryStatus.SUCCESS: - payload['data'] = data - payload['columns'] = columns - else: - payload['error'] = query.error_message if return_results: + payload = { + 'query_id': query.id, + 'status': query.status, + 'data': [], + } + if query.status == models.QueryStatus.SUCCESS: + payload['data'] = cdf.data if cdf else [] + payload['columns'] = cdf.columns_dict if cdf else [] + else: + payload['error'] = query.error_message return payload ''' # Hack testing using a kv store for results diff --git a/tests/celery_tests.py b/tests/celery_tests.py index 1cd3401c7e361..46cef8f3b8267 100644 --- a/tests/celery_tests.py +++ b/tests/celery_tests.py @@ -14,7 +14,7 @@ import pandas as pd import caravel -from caravel import app, appbuilder, db, models, sql_lab, utils +from caravel import app, appbuilder, db, models, sql_lab, utils, dataframe from .base_tests import CaravelTestCase @@ -34,6 +34,7 @@ class CeleryConfig(object): class UtilityFunctionTests(CaravelTestCase): + # TODO(bkyryliuk): support more cases in CTA function. def test_create_table_as(self): select_query = "SELECT * FROM outer_space;" @@ -193,8 +194,8 @@ def test_run_sync_query(self): result2 = self.run_sql( 1, sql_where, tmp_table='tmp_table_2', cta='true') self.assertEqual(QueryStatus.SUCCESS, result2['query']['state']) - self.assertIsNone(result2['data']) - self.assertIsNone(result2['columns']) + self.assertEqual([], result2['data']) + self.assertEqual([], result2['columns']) query2 = self.get_query_by_id(result2['query']['serverId']) # Check the data in the tmp table. 
@@ -208,8 +209,8 @@ def test_run_sync_query(self): result3 = self.run_sql( 1, sql_empty_result, tmp_table='tmp_table_3', cta='true',) self.assertEqual(QueryStatus.SUCCESS, result3['query']['state']) - self.assertIsNone(result3['data']) - self.assertIsNone(result3['columns']) + self.assertEqual([], result3['data']) + self.assertEqual([], result3['columns']) query3 = self.get_query_by_id(result3['query']['serverId']) self.assertEqual(QueryStatus.SUCCESS, query3.status) @@ -250,6 +251,52 @@ def test_run_async_query(self): self.assertEqual(True, query1.select_as_cta) self.assertEqual(True, query1.select_as_cta_used) + def test_get_columns_dict(self): + main_db = db.session.query(models.Database).filter_by( + database_name='main').first() + df = main_db.get_df("SELECT * FROM multiformat_time_series", None) + cdf = dataframe.CaravelDataFrame(df) + if main_db.sqlalchemy_uri.startswith('sqlite'): + self.assertEqual( + [{'is_date': True, 'type': 'datetime_string', 'name': 'ds', + 'is_dim': False}, + {'is_date': True, 'type': 'datetime_string', 'name': 'ds2', + 'is_dim': False}, + {'agg': 'sum', 'is_date': False, 'type': 'int64', + 'name': 'epoch_ms', 'is_dim': False}, + {'agg': 'sum', 'is_date': False, 'type': 'int64', + 'name': 'epoch_s', 'is_dim': False}, + {'is_date': True, 'type': 'datetime_string', 'name': 'string0', + 'is_dim': False}, + {'is_date': False, 'type': 'object', + 'name': 'string1', 'is_dim': True}, + {'is_date': True, 'type': 'datetime_string', 'name': 'string2', + 'is_dim': False}, + {'is_date': False, 'type': 'object', + 'name': 'string3', 'is_dim': True}] + , cdf.columns_dict + ) + else: + self.assertEqual( + [{'is_date': True, 'type': 'datetime_string', 'name': 'ds', + 'is_dim': False}, + {'is_date': True, 'type': 'datetime64[ns]', + 'name': 'ds2', 'is_dim': False}, + {'agg': 'sum', 'is_date': False, 'type': 'int64', + 'name': 'epoch_ms', 'is_dim': False}, + {'agg': 'sum', 'is_date': False, 'type': 'int64', + 'name': 'epoch_s', 'is_dim': False}, + 
{'is_date': True, 'type': 'datetime_string', 'name': 'string0', + 'is_dim': False}, + {'is_date': False, 'type': 'object', + 'name': 'string1', 'is_dim': True}, + {'is_date': True, 'type': 'datetime_string', 'name': 'string2', + 'is_dim': False}, + {'is_date': False, 'type': 'object', + 'name': 'string3', 'is_dim': True}] + , cdf.columns_dict + ) + if __name__ == '__main__': unittest.main()