diff --git a/app/models.py b/app/models.py
index 1a7630a0a3554..89440f90749ee 100644
--- a/app/models.py
+++ b/app/models.py
@@ -1,16 +1,220 @@
 from flask.ext.appbuilder import Model
-from pydruid import client
 from datetime import timedelta
-from flask.ext.appbuilder.models.mixins import AuditMixin, FileColumn
+from flask.ext.appbuilder.models.mixins import AuditMixin
 from sqlalchemy import Column, Integer, String, ForeignKey, Text, Boolean, DateTime
+from sqlalchemy import create_engine, MetaData
+from sqlalchemy import Table as sqlaTable
 from sqlalchemy.orm import relationship
-from app import get_session
 from dateutil.parser import parse
+from pydruid import client
+from pydruid.utils.filters import Dimension, Filter
+from copy import deepcopy, copy
 import logging
 import json
 import requests
+from app import db, get_session
+
+
+class Queryable(object):
+    @property
+    def column_names(self):
+        return sorted([c.column_name for c in self.columns])
+
+    @property
+    def groupby_column_names(self):
+        return sorted([c.column_name for c in self.columns if c.groupby])
+
+    @property
+    def filterable_column_names(self):
+        return sorted([c.column_name for c in self.columns if c.filterable])
+
+
+class Database(Model, AuditMixin):
+    __tablename__ = 'databases'
+    id = Column(Integer, primary_key=True)
+    database_name = Column(String(256), unique=True)
+    sqlalchemy_uri = Column(String(1024))
+
+    def __repr__(self):
+        return self.database_name
+
+    def get_sqla_engine(self):
+        return create_engine(self.sqlalchemy_uri)
+
+    def get_table(self, table_name):
+        meta = MetaData()
+        return sqlaTable(
+            table_name, meta,
+            autoload=True,
+            autoload_with=self.get_sqla_engine())
+
+
+class Table(Model, AuditMixin, Queryable):
+    __tablename__ = 'tables'
+    id = Column(Integer, primary_key=True)
+    table_name = Column(String(256), unique=True)
+    default_endpoint = Column(Text)
+    database_id = Column(
+        Integer, ForeignKey('databases.id'))
+    database = relationship(
+        'Database', backref='tables', foreign_keys=[database_id])
+
+    @property
+    def name(self):
+        return self.table_name
+
+    @property
+    def table_link(self):
+        url = "/panoramix/table/{}/".format(self.id)
+        return '<a href="{url}">{self.table_name}</a>'.format(**locals())
+
+    @property
+    def metrics_combo(self):
+        return sorted(
+            [
+                (m.metric_name, m.verbose_name)
+                for m in self.metrics],
+            key=lambda x: x[1])
+
+    def query(
+            self, groupby, metrics,
+            granularity,
+            from_dttm, to_dttm,
+            limit_spec=None,
+            filter=None,
+            is_timeseries=True,
+            timeseries_limit=15, row_limit=None):
+        from pandas import read_sql_query
+        metrics_exprs = [
+            "{} AS {}".format(m.expression, m.metric_name)
+            for m in self.metrics if m.metric_name in metrics]
+        from_dttm_iso = from_dttm.isoformat()
+        to_dttm_iso = to_dttm.isoformat()
+
+        if metrics:
+            main_metric_expr = [m.expression for m in self.metrics if m.metric_name == metrics[0]][0]
+        else:
+            main_metric_expr = "COUNT(*)"
+
+        select_exprs = []
+        groupby_exprs = []
+
+        if groupby:
+            select_exprs = copy(groupby)
+            groupby_exprs = [s for s in groupby]
+            inner_groupby_exprs = [s for s in groupby]
+        select_exprs += metrics_exprs
+        if granularity != "all":
+            select_exprs += ['ds as timestamp']
+            groupby_exprs += ['ds']
+
+        select_exprs = ",\n".join(select_exprs)
+        groupby_exprs = ",\n".join(groupby_exprs)
+
+        where_clause = [
+            "ds >= '{from_dttm_iso}'",
+            "ds < '{to_dttm_iso}'"
+        ]
+        for col, op, eq in filter or []:
+            if op in ('in', 'not in'):
+                l = ["'{}'".format(s) for s in eq.split(",")]
+                l = ", ".join(l)
+                op = op.upper()
+                where_clause.append(
"{col} {op} ({l})".format(**locals()) + ) + where_clause = " AND\n".join(where_clause).format(**locals()) + on_clause = " AND ".join(["{g} = __{g}".format(g=g) for g in groupby]) + limiting_join = "" + if timeseries_limit and groupby: + inner_select = ", ".join(["{g} as __{g}".format(g=g) for g in inner_groupby_exprs]) + inner_groupby_exprs = ", ".join(inner_groupby_exprs) + limiting_join = """ + JOIN ( + SELECT {inner_select} + FROM {self.table_name} + WHERE + {where_clause} + GROUP BY {inner_groupby_exprs} + ORDER BY {main_metric_expr} DESC + LIMIT {timeseries_limit} + ) z ON {on_clause} + """.format(**locals()) + + sql = """ + SELECT + {select_exprs} + FROM {self.table_name} + {limiting_join} + WHERE + {where_clause} + GROUP BY + {groupby_exprs} + """.format(**locals()) + df = read_sql_query( + sql=sql, + con=self.database.get_sqla_engine() + ) + return df + + + def fetch_metadata(self): + table = self.database.get_table(self.table_name) + TC = TableColumn + for col in table.columns: + dbcol = ( + db.session + .query(TC) + .filter(TC.table==self) + .filter(TC.column_name==col.name) + .first() + ) + db.session.flush() + if not dbcol: + dbcol = TableColumn(column_name=col.name) + if str(col.type) in ('VARCHAR', 'STRING'): + dbcol.groupby = True + dbcol.filterable = True + self.columns.append(dbcol) + + dbcol.type = str(col.type) + db.session.commit() + + +class SqlMetric(Model): + __tablename__ = 'sql_metrics' + id = Column(Integer, primary_key=True) + metric_name = Column(String(512)) + verbose_name = Column(String(1024)) + metric_type = Column(String(32)) + table_id = Column( + String(256), + ForeignKey('tables.id')) + table = relationship( + 'Table', backref='metrics', foreign_keys=[table_id]) + expression = Column(Text) + description = Column(Text) + + +class TableColumn(Model, AuditMixin): + __tablename__ = 'table_columns' + id = Column(Integer, primary_key=True) + table_id = Column( + String(256), + ForeignKey('tables.id')) + table = relationship('Table', backref='columns', foreign_keys=[table_id]) + column_name = Column(String(256)) + is_dttm = Column(Boolean, default=True) + is_active = Column(Boolean, default=True) + type = Column(String(32), default='') + groupby = Column(Boolean, default=False) + count_distinct = Column(Boolean, default=False) + sum = Column(Boolean, default=False) + max = Column(Boolean, default=False) + min = Column(Boolean, default=False) + filterable = Column(Boolean, default=False) + description = Column(Text, default='') + class Cluster(Model, AuditMixin): __tablename__ = 'clusters' @@ -40,13 +244,10 @@ def refresh_datasources(self): ).format(self=self) datasources = json.loads(requests.get(endpoint).text) for datasource in datasources: - #try: - Datasource.sync_to_db(datasource, self) - #except Exception as e: - # logging.exception(e) - # logging.error("Failed at syncing " + datasource) + Datasource.sync_to_db(datasource, self) + -class Datasource(Model, AuditMixin): +class Datasource(Model, AuditMixin, Queryable): __tablename__ = 'datasources' id = Column(Integer, primary_key=True) datasource_name = Column(String(256), unique=True) @@ -67,6 +268,10 @@ def metrics_combo(self): [(m.metric_name, m.verbose_name) for m in self.metrics], key=lambda x: x[1]) + @property + def name(self): + return self.datasource_name + def __repr__(self): return self.datasource_name @@ -130,17 +335,102 @@ def sync_to_db(cls, name, cluster): col_obj.generate_metrics() #session.commit() - @property - def column_names(self): - return sorted([c.column_name for c in self.columns]) 
+    def query(
+            self, groupby, metrics,
+            granularity,
+            from_dttm, to_dttm,
+            limit_spec=None,
+            filter=None,
+            is_timeseries=True,
+            timeseries_limit=15, row_limit=None):
 
-    @property
-    def groupby_column_names(self):
-        return sorted([c.column_name for c in self.columns if c.groupby])
+        aggregations = {
+            m.metric_name: m.json_obj
+            for m in self.metrics if m.metric_name in metrics
+        }
+        if not isinstance(granularity, basestring):
+            granularity = {"type": "duration", "duration": granularity}
 
-    @property
-    def filterable_column_names(self):
-        return sorted([c.column_name for c in self.columns if c.filterable])
+        qry = dict(
+            datasource=self.datasource_name,
+            dimensions=groupby,
+            aggregations=aggregations,
+            granularity=granularity,
+            intervals=from_dttm.isoformat() + '/' + to_dttm.isoformat(),
+        )
+        filters = None
+        for col, op, eq in filter or []:
+            cond = None
+            if op == '==':
+                cond = Dimension(col) == eq
+            elif op == '!=':
+                cond = ~(Dimension(col) == eq)
+            elif op in ('in', 'not in'):
+                fields = []
+                splitted = eq.split(',')
+                if len(splitted) > 1:
+                    for s in splitted:
+                        s = s.strip()
+                        fields.append(Filter.build_filter(Dimension(col) == s))
+                    cond = Filter(type="or", fields=fields)
+                else:
+                    cond = Dimension(col) == eq
+                if op == 'not in':
+                    cond = ~cond
+            if filters:
+                filters = Filter(type="and", fields=[
+                    Filter.build_filter(cond),
+                    Filter.build_filter(filters)
+                ])
+            else:
+                filters = cond
+
+        if filters:
+            qry['filter'] = filters
+
+        client = self.cluster.get_pydruid_client()
+        orig_filters = filters
+        if timeseries_limit:
+            # Limit on the number of timeseries, doing a two-phases query
+            pre_qry = deepcopy(qry)
+            pre_qry['granularity'] = "all"
+            pre_qry['limit_spec'] = {
+                "type": "default",
+                "limit": timeseries_limit,
+                "columns": [{
+                    "dimension": metrics[0] if metrics else self.metrics[0].metric_name,
+                    "direction": "descending",
+                }],
+            }
+            client.groupby(**pre_qry)
+            df = client.export_pandas()
+            if df is not None and not df.empty:
+                dims = qry['dimensions']
+                filters = []
+                for index, row in df.iterrows():
+                    fields = []
+                    for dim in dims:
+                        f = Filter.build_filter(Dimension(dim) == row[dim])
+                        fields.append(f)
+                    if len(fields) > 1:
+                        filt = Filter(type="and", fields=fields)
+                        filters.append(Filter.build_filter(filt))
+                    elif fields:
+                        filters.append(fields[0])
+
+                if filters:
+                    ff = Filter(type="or", fields=filters)
+                    if not orig_filters:
+                        qry['filter'] = ff
+                    else:
+                        qry['filter'] = Filter(type="and", fields=[
+                            Filter.build_filter(ff),
+                            Filter.build_filter(orig_filters)])
+                    qry['limit_spec'] = None
+
+        client.groupby(**qry)
+        df = client.export_pandas()
+        return df
 
 
 class Metric(Model):
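A note on the new `Table.query` path above: filter values are interpolated straight into the SQL string via `str.format` (only `in` / `not in` are translated on the SQL side so far), the `limit_spec`, `is_timeseries`, and `row_limit` arguments are accepted but not yet used, and passing `timeseries_limit` with a `groupby` activates the two-phase limiting `JOIN` that first ranks groups by the main metric. A minimal usage sketch, assuming `tbl` is a hydrated `Table` instance whose metrics include a hypothetical `SqlMetric` named `row_count` (expression `COUNT(*)`) and whose physical table has the `ds` datetime column that `Table.query` hard-codes:

```python
from datetime import datetime, timedelta

# Illustrative call only; `tbl` and the `row_count` metric are assumptions,
# not part of this patch.
df = tbl.query(
    groupby=['country'],
    metrics=['row_count'],
    granularity='all',                    # skips the "ds as timestamp" column
    from_dttm=datetime.now() - timedelta(days=7),
    to_dttm=datetime.now(),
    filter=[('country', 'in', 'US,FR')],  # rendered as: country IN ('US', 'FR')
    timeseries_limit=5,                   # enables the limiting inner JOIN
)
print(df.head())
```

Because the WHERE clause is assembled by string formatting, filter values need to be trusted input; binding parameters through SQLAlchemy would be the safer long-term shape.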
diff --git a/app/templates/panoramix/datasource.html b/app/templates/panoramix/datasource.html
index 0a53cc6543ad0..599d80efb4253 100644
--- a/app/templates/panoramix/datasource.html
+++ b/app/templates/panoramix/datasource.html
@@ -22,7 +22,7 @@
 
-        {{ datasource.datasource_name }}
+        {{ datasource.name }}
         {% if datasource.description %}
         {% endif %}
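The one-line template change rides on the new `name` property: both `Datasource` and `Table` now expose `.name` (and share `column_names`, `groupby_column_names`, and `filterable_column_names` through the `Queryable` mixin), so templates and viz code can stay agnostic of which backend they are rendering. A small illustration with hypothetical hydrated instances:

```python
# Either model satisfies the implicit interface the template relies on.
# `some_table` and `some_datasource` are assumed instances, not patch code.
for queryable in [some_table, some_datasource]:
    print(queryable.name)                  # table_name or datasource_name
    print(queryable.groupby_column_names)  # provided by the Queryable mixin
```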
diff --git a/app/views.py b/app/views.py
index 561802c1828b3..80ed7289cda15 100644
--- a/app/views.py
+++ b/app/views.py
@@ -27,6 +27,19 @@ def muldelete(self, items):
     return redirect(self.get_redirect())
 
 
+class TableColumnInlineView(CompactCRUDMixin, ModelView):
+    datamodel = SQLAInterface(models.TableColumn)
+    can_delete = False
+    edit_columns = [
+        'column_name', 'description', 'table', 'groupby', 'filterable',
+        'count_distinct', 'sum', 'min', 'max']
+    list_columns = [
+        'column_name', 'type', 'groupby', 'count_distinct',
+        'sum', 'min', 'max']
+    page_size = 100
+appbuilder.add_view_no_menu(TableColumnInlineView)
+
+
 class ColumnInlineView(CompactCRUDMixin, ModelView):
     datamodel = SQLAInterface(models.Column)
     edit_columns = [
@@ -46,6 +62,16 @@ def post_update(self, col):
 appbuilder.add_view_no_menu(ColumnInlineView)
 
 
+class SqlMetricInlineView(CompactCRUDMixin, ModelView):
+    datamodel = SQLAInterface(models.SqlMetric)
+    list_columns = ['metric_name', 'verbose_name', 'metric_type']
+    edit_columns = [
+        'metric_name', 'description', 'verbose_name', 'metric_type',
+        'table', 'expression']
+    add_columns = edit_columns
+    page_size = 100
+appbuilder.add_view_no_menu(SqlMetricInlineView)
+
 
 class MetricInlineView(CompactCRUDMixin, ModelView):
     datamodel = SQLAInterface(models.Metric)
@@ -80,6 +106,39 @@ class ClusterModelView(ModelView, DeleteMixin):
     category_icon='fa-cogs',)
 
 
+class DatabaseView(ModelView, DeleteMixin):
+    datamodel = SQLAInterface(models.Database)
+    list_columns = ['database_name']
+    add_columns = ['database_name', 'sqlalchemy_uri']
+    edit_columns = add_columns
+
+appbuilder.add_view(
+    DatabaseView,
+    "Databases",
+    icon="fa-database",
+    category="Admin",
+    category_icon='fa-cogs',)
+
+
+class TableView(ModelView, DeleteMixin):
+    datamodel = SQLAInterface(models.Table)
+    list_columns = ['table_link', 'database']
+    add_columns = ['table_name', 'database', 'default_endpoint']
+    edit_columns = add_columns
+    related_views = [TableColumnInlineView, SqlMetricInlineView]
+
+    def post_insert(self, table):
+        table.fetch_metadata()
+
+    def post_update(self, table):
+        table.fetch_metadata()
+
+appbuilder.add_view(
+    TableView,
+    "Tables",
+    icon='fa-table',)
+
+
 class DatasourceModelView(ModelView, DeleteMixin):
     datamodel = SQLAInterface(models.Datasource)
     list_columns = [
@@ -101,8 +160,7 @@ def post_update(self, datasource):
 appbuilder.add_view(
     DatasourceModelView,
     "Druid Datasources",
-    icon="fa-cube",
-    category_icon='fa-envelope')
+    icon="fa-cube")
 
 
 @app.route('/health')
@@ -116,6 +174,33 @@ def ping():
 
 
 class Panoramix(BaseView):
+    @has_access
+    @permission_name('tables')
+    @expose("/table/<table_id>/")
+    def table(self, table_id):
+        table = (
+            db.session
+            .query(models.Table)
+            .filter_by(id=table_id)
+            .first()
+        )
+        viz_type = request.args.get("viz_type")
+        if not viz_type and table.default_endpoint:
+            return redirect(table.default_endpoint)
+        if not viz_type:
+            viz_type = "table"
+        obj = viz.viz_types[viz_type](
+            table,
+            form_data=request.args, view=self)
+        if request.args.get("json"):
+            return Response(
+                json.dumps(obj.query_obj(), indent=4, default=str),
+                status=200,
+                mimetype="application/json")
+        if obj.df is None or obj.df.empty:
+            return obj.render_no_data()
+        return obj.render()
+
     @has_access
     @permission_name('datasources')
     @expose("/datasource/<datasource_name>/")
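On the views side, `/panoramix/table/<table_id>/` mirrors the existing datasource endpoint: with no `viz_type` argument it redirects to the table's `default_endpoint` if one is set, otherwise it falls back to the `table` viz, and a `json` query argument returns the query object instead of rendered HTML. Since this patch removes the pydruid-backed `get_query` from `viz.py`, the JSON branch serializes the backend-neutral `query_obj()` dict (with `default=str` for the datetime values it contains). A smoke-test sketch, assuming a local dev server on port 8080 and an existing `Table` with id 1:

```python
import requests

# Hypothetical host, port, and table id; adjust to your environment.
resp = requests.get(
    "http://localhost:8080/panoramix/table/1/",
    params={"viz_type": "table", "json": "true"},
)
print(resp.status_code, resp.headers.get("Content-Type"))
print(resp.text[:300])  # the serialized query dict
```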
diff --git a/app/viz.py b/app/viz.py
index 02212639565fb..42c9570b4710a 100644
--- a/app/viz.py
+++ b/app/viz.py
@@ -1,4 +1,3 @@
-from pydruid.utils.filters import Dimension, Filter
 from datetime import datetime
 from flask import flash, request
 import pandas as pd
@@ -7,6 +6,7 @@
 from app.highchart import Highchart
 from wtforms import Form, SelectMultipleField, SelectField, TextField
 import config
+from pydruid.utils.filters import Dimension, Filter
 
 
 CHART_ARGS = {
@@ -15,6 +15,7 @@
     'target_div': 'chart',
 }
 
+
 class OmgWtForm(Form):
     field_order = (
         'viz_type', 'granularity', 'since', 'group_by', 'limit')
@@ -79,7 +80,8 @@ def __init__(self, datasource, form_data, view):
         self.df = self.bake_query()
         self.view = view
         if self.df is not None:
-            self.df.timestamp = pd.to_datetime(self.df.timestamp)
+            if 'timestamp' in self.df.columns:
+                self.df.timestamp = pd.to_datetime(self.df.timestamp)
             self.df_prep()
             self.form_prep()
 
@@ -89,91 +91,49 @@ def form_class(self):
     def query_filters(self):
         args = self.form_data
         # Building filters
-        filters = None
+        filters = []
         for i in range(1, 10):
             col = args.get("flt_col_" + str(i))
             op = args.get("flt_op_" + str(i))
             eq = args.get("flt_eq_" + str(i))
             if col and op and eq:
-                cond = None
-                if op == '==':
-                    cond = Dimension(col)==eq
-                elif op == '!=':
-                    cond = ~(Dimension(col)==eq)
-                elif op in ('in', 'not in'):
-                    fields = []
-                    splitted = eq.split(',')
-                    if len(splitted) > 1:
-                        for s in eq.split(','):
-                            s = s.strip()
-                            fields.append(Filter.build_filter(Dimension(col)==s))
-                        cond = Filter(type="or", fields=fields)
-                    else:
-                        cond = Dimension(col)==eq
-                    if op == 'not in':
-                        cond = ~cond
-                if filters:
-                    filters = Filter(type="and", fields=[
-                        Filter.build_filter(cond),
-                        Filter.build_filter(filters)
-                    ])
-                else:
-                    filters = cond
+                filters.append((col, op, eq))
         return filters
 
+    def bake_query(self):
+        return self.datasource.query(**self.query_obj())
+
     def query_obj(self):
         ds = self.datasource
         args = self.form_data
         groupby = args.getlist("groupby") or []
+        metrics = args.getlist("metrics") or ['count']
         granularity = args.get("granularity", "1 day")
-        granularity = utils.parse_human_timedelta(granularity).total_seconds() * 1000
-        aggregations = {
-            m.metric_name: m.json_obj
-            for m in ds.metrics if m.metric_name in self.metrics
-        }
+        granularity = utils.parse_human_timedelta(
+            granularity).total_seconds() * 1000
         limit = int(
             args.get("limit", config.ROW_LIMIT)) or config.ROW_LIMIT
         since = args.get("since", "1 year ago")
         from_dttm = utils.parse_human_datetime(since)
         if from_dttm > datetime.now():
             from_dttm = datetime.now() - (from_dttm - datetime.now())
-        from_dttm = from_dttm.isoformat()
         until = args.get("until", "now")
-        to_dttm = utils.parse_human_datetime(until).isoformat()
+        to_dttm = utils.parse_human_datetime(until)
         if from_dttm >= to_dttm:
             flash("The date range doesn't seem right.", "danger")
             from_dttm = to_dttm  # Making them identical to not raise
         d = {
-            'datasource': ds.datasource_name,
-            'granularity': {"type": "duration", "duration": granularity},
-            'intervals': from_dttm + '/' + to_dttm,
-            'dimensions': groupby,
-            'aggregations': aggregations,
-            'limit_spec': {
-                "type": "default",
-                "limit": limit,
-                "columns": [{
-                    "dimension": self.metrics[0],
-                    "direction": "descending",
-                }],
-            },
+            'granularity': granularity,
+            'from_dttm': from_dttm,
+            'to_dttm': to_dttm,
+            'groupby': groupby,
+            'metrics': metrics,
+            'filter': self.query_filters(),
+            'timeseries_limit': limit,
         }
-        filters = self.query_filters()
-        if filters:
-            d['filter'] = filters
         return d
 
-    def bake_query(self):
-        client = self.datasource.cluster.get_pydruid_client()
-        client.groupby(**self.query_obj())
-        return client.export_pandas()
-
-    def get_query(self):
-        client = self.datasource.cluster.get_pydruid_client()
-        client.groupby(**self.query_obj())
-        return client.query_dict
-
-    def df_prep(self, ):
+    def df_prep(self):
         pass
 
     def form_prep(self):
@@ -265,37 +225,8 @@ def bake_query(self):
         """
         Doing a 2 phase query where we limit the number of series.
         """
-        client = self.datasource.cluster.get_pydruid_client()
         qry = self.query_obj()
-        orig_filter = qry['filter'] if 'filter' in qry else ''
-        qry['granularity'] = "all"
-        client.groupby(**qry)
-        df = client.export_pandas()
-        if not df is None:
-            dims = qry['dimensions']
-            filters = []
-            for index, row in df.iterrows():
-                fields = []
-                for dim in dims:
-                    f = Filter.build_filter(Dimension(dim) == row[dim])
-                    fields.append(f)
-                if len(fields) > 1:
-                    filters.append(Filter.build_filter(Filter(type="and", fields=fields)))
-                elif fields:
-                    filters.append(fields[0])
-
-            qry = self.query_obj()
-            if filters:
-                ff = Filter(type="or", fields=filters)
-                if not orig_filter:
-                    qry['filter'] = ff
-                else:
-                    qry['filter'] = Filter(type="and", fields=[
-                        Filter.build_filter(ff),
-                        Filter.build_filter(orig_filter)])
-            del qry['limit_spec']
-            client.groupby(**qry)
-            return client.export_pandas()
+        return self.datasource.query(**qry)
 
 
 class TimeSeriesCompareViz(TimeSeriesViz):
     verbose_name = "Time Series - Percent Change"
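With the pydruid plumbing gone from `viz.py`, filters now travel as plain `(col, op, eq)` tuples: `query_filters` collects them from the form, and each `Queryable.query` implementation translates them for its backend (Druid into `Dimension`/`Filter` trees, SQL into `IN`/`NOT IN` clauses). A self-contained sketch of that hand-off, using hypothetical form data:

```python
from werkzeug.datastructures import ImmutableMultiDict

# Stand-in for Flask's request.args; the filter values are hypothetical.
args = ImmutableMultiDict([
    ("flt_col_1", "country"), ("flt_op_1", "in"), ("flt_eq_1", "US,FR"),
    ("flt_col_2", "device"), ("flt_op_2", "=="), ("flt_eq_2", "mobile"),
])

# Same loop shape as the new query_filters():
filters = []
for i in range(1, 10):
    col = args.get("flt_col_" + str(i))
    op = args.get("flt_op_" + str(i))
    eq = args.get("flt_eq_" + str(i))
    if col and op and eq:
        filters.append((col, op, eq))

print(filters)  # [('country', 'in', 'US,FR'), ('device', '==', 'mobile')]
```

Keeping the tuples backend-neutral is what lets `bake_query` shrink to a single `self.datasource.query(**self.query_obj())` call for both SQL tables and Druid datasources.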