From 085df7e2be04d9ded535f716cfc52c3091d9235f Mon Sep 17 00:00:00 2001 From: Peter Adams Date: Fri, 28 Jan 2022 15:10:49 -0800 Subject: [PATCH] Move traffic attribution inference to server side (#812) * moving traffic attribution inference to server side. * moving config arrays to their own files. * adding ability to supplement the search engine and social network lists with supplemental files. * moving conf file loading to coreAPI. * fixing missing namespace. Co-authored-by: padams --- conf/searchengines.php | 52 ++++ conf/socialnetworks.php | 13 + modules/base/classes/trackingEventHelpers.php | 233 +++++++++++++++++- modules/base/module.php | 28 ++- modules/base/src/tracker/Tracker.js | 148 +---------- owa_coreAPI.php | 32 ++- owa_env.php | 1 + owa_requestContainer.php | 5 +- 8 files changed, 358 insertions(+), 154 deletions(-) create mode 100644 conf/searchengines.php create mode 100644 conf/socialnetworks.php diff --git a/conf/searchengines.php b/conf/searchengines.php new file mode 100644 index 00000000..b0fec72b --- /dev/null +++ b/conf/searchengines.php @@ -0,0 +1,52 @@ + 'google', 'query_param' =>'q'], + ['domain' => 'yahoo', 'query_param' => 'p'], + ['domain' => 'msn', 'query_param' => 'q'], + ['domain' => 'bing', 'query_param' => 'q'], + ['domain' => 'images.google', 'query_param' => 'q'], + ['domain' => 'images.search.yahoo.com', 'query_param' => 'p'], + ['domain' => 'aol', 'query_param' => 'query'], + ['domain' => 'aol', 'query_param' => 'encquery'], + ['domain' => 'aol', 'query_param' => 'q'], + ['domain' => 'lycos', 'query_param' => 'query'], + ['domain' => 'ask', 'query_param' => 'q'], + ['domain' => 'altavista', 'query_param' => 'q'], + ['domain' => 'netscape', 'query_param' => 'query'], + ['domain' => 'cnn', 'query_param' => 'query'], + ['domain' => 'about', 'query_param' => 'terms'], + ['domain' => 'mamma', 'query_param' => 'q'], + ['domain' => 'daum', 'query_param' => 'q'], + ['domain' => 'eniro', 'query_param' => 'search_word'], + ['domain' => 
'naver', 'query_param' => 'query'], + ['domain' => 'pchome', 'query_param' => 'q'], + ['domain' => 'alltheweb', 'query_param' => 'q'], + ['domain' => 'voila', 'query_param' => 'rdata'], + ['domain' => 'virgilio', 'query_param' => 'qs'], + ['domain' => 'live', 'query_param' => 'q'], + ['domain' => 'baidu', 'query_param' => 'wd'], + ['domain' => 'alice', 'query_param' => 'qs'], + ['domain' => 'yandex', 'query_param' => 'text'], + ['domain' => 'najdi', 'query_param' => 'q'], + ['domain' => 'mama', 'query_param' => 'query'], + ['domain' => 'seznam', 'query_param' => 'q'], + ['domain' => 'search', 'query_param' => 'q'], + ['domain' => 'wp', 'query_param' => 'szukaj'], + ['domain' => 'onet', 'query_param' => 'qt'], + ['domain' => 'szukacz', 'query_param' => 'q'], + ['domain' => 'yam', 'query_param' => 'k'], + ['domain' => 'kvasir', 'query_param' => 'q'], + ['domain' => 'sesam', 'query_param' => 'q'], + ['domain' => 'ozu', 'query_param' => 'q'], + ['domain' => 'terra', 'query_param' => 'query'], + ['domain' => 'mynet', 'query_param' => 'q'], + ['domain' => 'ekolay', 'query_param' => 'q'], + ['domain' => 'rambler', 'query_param' => 'query'], + ['domain' => 'rambler', 'query_param' => 'words'], + ['domain' => 'duckduckgo', 'query_param' => 'q'] +]; + + +?> \ No newline at end of file diff --git a/conf/socialnetworks.php b/conf/socialnetworks.php new file mode 100644 index 00000000..167c4a40 --- /dev/null +++ b/conf/socialnetworks.php @@ -0,0 +1,13 @@ + 'facebook'], + ['domain' => 'twitter'], + ['domain' => 'pinterest'], + ['domain' => 'instagram'], + ['domain' => 'linkedin'], + ['domain' => 't.co'] +]; + +?> \ No newline at end of file diff --git a/modules/base/classes/trackingEventHelpers.php b/modules/base/classes/trackingEventHelpers.php index 50f1f0c8..53bbf2fc 100644 --- a/modules/base/classes/trackingEventHelpers.php +++ b/modules/base/classes/trackingEventHelpers.php @@ -450,6 +450,234 @@ static function derivePageUri( $page_uri, $event ) { return $page_parse['path'] 
; } + } + + static function deriveMedium( $medium, $event ) { + + // respect what was already set by the tracker + if ( $medium ) { + + return $medium; + } + + if ( $event->get( 'session_referer' ) ) { + + // check for referrer url + $ref = $event->get('session_referer'); + + if ( $ref ) { + + // parse the referrer url + $uri = self::parse_url( $ref ); + + $host = $uri['host']; + + $medium = 'referral'; + + // check if referral is a search engine + $engine = self::isSearchEngine( $host ); + + if ( $engine ) { + + $medium = 'organic-search'; + } + + if ( ! $engine ) { + + // check if referral is a social network + $network = self::issocialNetwork( $host ); + + if ( $network ) { + + $medium = 'social-network'; + } + } + } + + return $medium; + } + } + + /** + * Use this function to parse out the url and query array element from + * a url. + */ + public static function parse_url( $url ) { + + $url = parse_url($url); + + if ( isset( $url['query'] ) ) { + $var = $url['query']; + + $var = html_entity_decode($var); + $var = explode('&', $var); + $arr = array(); + + foreach( $var as $val ) { + + if ( strpos($val, '=') ) { + $x = explode('=', $val); + + if ( isset( $x[1] ) ) { + $arr[$x[0]] = urldecode($x[1]); + } + } else { + $arr[$val] = ''; + } + } + unset($val, $x, $var); + + $url['query_params'] = $arr; + + } + + return $url; + } + + + static function deriveSource( $source, $event ) { + + // respect what was already set by the tracker + if ( $source ) { + + return $source; + } + + + if ( $event->get( 'session_referer' ) ) { + + $ref = $event->get( 'session_referer' ); + $uri = self::parse_url( $ref ); + + $host = $uri['host']; + + if ($host) { + + $source = self::stripWwwFromDomain( $host ); + return $source; + } + } + } + + static function stripWWWFromDomain( $domain ) { + + $done = false; + $part = substr( $domain, 0, 5 ); + if ($part === '.www.') { + //strip .www. 
+ $domain = substr( $domain, 5); + // add back the leading period + $domain = '.'.$domain; + $done = true; + } + + if ( ! $done ) { + $part = substr( $domain, 0, 4 ); + if ($part === 'www.') { + //strip .www. + $domain = substr( $domain, 4); + $done = true; + } + + } + + return $domain; + } + + static function isSearchEngine( $host ) { + + if ( ! $host ) { + + return; + } + + $searchEngine = []; + + $organicSearchEngines = self::getSearchEngineList(); + + foreach ( $organicSearchEngines as $engine ) { + + $domain = $engine['domain']; + + if ( strpos( $host, $domain ) ) { + + owa_coreAPI::debug( 'Found search engine: '. $domain); + + return true; + } + } + } + + static function extractSearchTerm( $term, $event ) { + + if ( $term ) { + + return $term; + } + + if ( $event->get( 'session_referer' ) ) { + + // check for referrer url + $ref = $event->get( 'session_referer' ); + + $uri = self::parse_url( $ref ); + owa_coreAPI::debug($uri); + // check for query params, search engine might have sent them under https + if ( array_key_exists('query_params', $uri) && ! empty( $uri['query_params'] ) ) { + + $host = $uri['host']; + + $organicSearchEngines = self::getSearchEngineList(); + + foreach ( $organicSearchEngines as $engine ) { + + $domain = $engine['domain']; + + if ( strpos( $host, $domain) ) { + + $query_param = $engine['query_param']; + $term = ''; + + if (isset($uri['query_params'][$query_param])) { + + $term = $uri['query_params'][$query_param]; + owa_coreAPI::debug( 'Found search term: ' . 
$term); + + } else { + + $term = '(not provided)'; + } + // need urldecode here ot clean up the "+" characters in the term + return trim( urldecode( strtolower( $term ) ) ); + } + } + } + } + } + + static function isSocialNetwork( $host ) { + + $social_networks = self::getSocialNetworkList(); + + foreach ( $social_networks as $network ) { + + if ( strpos( $host, $network['domain'] ) ) { + + owa_coreAPI::debug( 'Found social network: %s', $network['domain'] ); + + return true; + } + } + } + + static function getSearchEngineList() { + + return owa_coreAPI::loadConf( 'searchengines.php', 'tracking.search_engine_registry' ); + } + + static function getSocialNetworkList() { + + return owa_coreAPI::loadConf( 'socialnetworks.php', 'tracking.social_network_registry' ); } /** @@ -592,7 +820,7 @@ static function utfEncodeProperty( $string, $event ) { static function resolveFullHost( $full_host, $event ) { // See if host is already resolved - if ( $event->get('REMOTE_HOST') === '(not set)' + if ( ( ! $event->get('REMOTE_HOST') || $event->get('REMOTE_HOST') === '(not set)' ) && $event->get( 'ip_address' ) && owa_coreAPI::getSetting('base', 'resolve_hosts') ) { @@ -614,7 +842,7 @@ static function resolveFullHost( $full_host, $event ) { } } else { - + $remote_host = @gethostbyaddr( $ip_address ); } @@ -701,6 +929,7 @@ static function resolveOs ( $os, $event ) { } static function resolveEntryPage( $is_entry_page, $event ) { + return $event->get('is_new_session') ? 
true : false; } diff --git a/modules/base/module.php b/modules/base/module.php index 937bb0a3..6d052399 100644 --- a/modules/base/module.php +++ b/modules/base/module.php @@ -225,30 +225,32 @@ public function setupTrackingProperties() { 'data_type' => 'url', 'callbacks' => array( 'owa_trackingEventHelpers::makeUrlCanonical' ) ), - + + 'session_referer' => array( + 'required' => false, + 'data_type' => 'url', + 'callbacks' => array() + ), + // must come after session_referer 'source' => array( 'required' => true, 'data_type' => 'string', - 'callbacks' => array( 'owa_trackingEventHelpers::lowercaseString' ), + 'callbacks' => array( 'owa_trackingEventHelpers::lowercaseString', 'owa_trackingEventHelpers::deriveSource' ), 'default_value' => '(not set)' ), - + // must come after session_referer 'medium' => array( 'required' => true, 'data_type' => 'string', - 'callbacks' => array( 'owa_trackingEventHelpers::lowercaseString' ), - 'default_value' => '(not set)' - ), - - 'session_referer' => array( - 'required' => false, - 'data_type' => 'url', - 'callbacks' => array() + 'callbacks' => array( 'owa_trackingEventHelpers::lowercaseString', 'owa_trackingEventHelpers::deriveMedium' ), + 'default_value' => 'direct' ), + + // must come after session_referer // @todo investigate if this should be a required property so that a proper join can occur. 
'search_terms' => array( - 'required' => false, - 'callbacks' => array( 'owa_trackingEventHelpers::setSearchTerms' ), + 'required' => true, + 'callbacks' => array( 'owa_trackingEventHelpers::extractSearchTerm' ), 'default_value' => '(not set)' ), diff --git a/modules/base/src/tracker/Tracker.js b/modules/base/src/tracker/Tracker.js index 600a5d24..1e01c5d7 100644 --- a/modules/base/src/tracker/Tracker.js +++ b/modules/base/src/tracker/Tracker.js @@ -40,59 +40,7 @@ class OWATracker { this.isTrafficAttributed = false; this.linkedStateSet = false; this.hashCookiesToDomain = true; - this.organicSearchEngines = [ - {d: 'google', q: 'q'}, - {d: 'yahoo', q: 'p'}, - {d: 'yahoo', q: 'q'}, - {d: 'msn', q: 'q'}, - {d: 'bing', q: 'q'}, - {d: 'images.google', q: 'q'}, - {d: 'images.search.yahoo.com', q: 'p'}, - {d: 'aol', q: 'query'}, - {d: 'aol', q: 'encquery'}, - {d: 'aol', q: 'q'}, - {d: 'lycos', q: 'query'}, - {d: 'ask', q: 'q'}, - {d: 'altavista', q: 'q'}, - {d: 'netscape', q: 'query'}, - {d: 'cnn', q: 'query'}, - {d: 'about', q: 'terms'}, - {d: 'mamma', q: 'q'}, - {d: 'daum', q: 'q'}, - {d: 'eniro', q: 'search_word'}, - {d: 'naver', q: 'query'}, - {d: 'pchome', q: 'q'}, - {d: 'alltheweb', q: 'q'}, - {d: 'voila', q: 'rdata'}, - {d: 'virgilio', q: 'qs'}, - {d: 'live', q: 'q'}, - {d: 'baidu', q: 'wd'}, - {d: 'alice', q: 'qs'}, - {d: 'yandex', q: 'text'}, - {d: 'najdi', q: 'q'}, - {d: 'mama', q: 'query'}, - {d: 'seznam', q: 'q'}, - {d: 'search', q: 'q'}, - {d: 'wp', q: 'szukaj'}, - {d: 'onet', q: 'qt'}, - {d: 'szukacz', q: 'q'}, - {d: 'yam', q: 'k'}, - {d: 'kvasir', q: 'q'}, - {d: 'sesam', q: 'q'}, - {d: 'ozu', q: 'q'}, - {d: 'terra', q: 'query'}, - {d: 'mynet', q: 'q'}, - {d: 'ekolay', q: 'q'}, - {d: 'rambler', q: 'query'}, - {d: 'rambler', q: 'words'}, - {d: 'duckduckgo', q: 'q'}, - ]; - - this.socialNetworks = [ - - 'facebook', 'twitter', 'pinterest', 'instagram', 'linkedin', 't.co' - ]; - + /** * GET params parsed from URL */ @@ -1321,7 +1269,8 @@ class OWATracker { var 
campaign_params = {}; for (var i = 0, n = campaignKeys.length; i < n; i++) { - + + // anytime we see a campaign param on the URL its a new campaign. if ( this.urlParams.hasOwnProperty(campaignKeys[i].public) ) { campaign_params[campaignKeys[i].private] = this.urlParams[campaignKeys[i].public]; @@ -1492,9 +1441,12 @@ class OWATracker { } else { // infer the attribution from the referer // if the request is the start of a new session + if ( this.isNewSessionFlag === true ) { + var ref = document.referrer; + OWA.setState( 's', 'referer', ref ); OWA.debug( 'Infering traffic attribution.' ); - this.inferTrafficAttribution(); + } } @@ -1510,6 +1462,7 @@ class OWATracker { } // set sesion referer + // @todo move this logic to service side. not really needed in tracker as we already send HTTTP_REFERER var session_referer = OWA.getState('s', 'referer'); if ( session_referer ) { @@ -1529,50 +1482,7 @@ class OWATracker { } } - inferTrafficAttribution() { - - var ref = document.referrer; - var medium = 'direct'; - var source = '(none)'; - var search_terms = '(none)'; - var session_referer = '(none)'; - - if ( ref ) { - - var uri = new Uri( ref ); - - // check for external referer - if ( document.domain != uri.getHost() ) { - - medium = 'referral'; - session_referer = ref; - source = Util.stripWwwFromDomain( uri.getHost() ); - - // check for search engine - var engine = this.isRefererSearchEngine( uri ); - - if ( engine ) { - - medium = 'organic-search'; - search_terms = engine.t || '(not provided)'; - - } else { - // check for social network - var social = this.isRefererSocialNetwork( uri ); - - if ( social ) { - - medium = 'social-network'; - } - } - } - } - OWA.setState( 's', 'referer', session_referer ); - OWA.setState( 's', 'medium', medium ); - OWA.setState( 's', 'source', source ); - OWA.setState( 's', 'search_terms', search_terms ); - } setCampaignCookie( values ) { @@ -1580,47 +1490,13 @@ class OWATracker { OWA.setState( 'c', 'attribs', values, '', 'json', 
this.options.campaignAttributionWindow ); } - isRefererSocialNetwork( uri ) { - - for ( var i = 0, n = this.socialNetworks.length; i < n; i++ ) { - - var domain = this.socialNetworks[i]; - var host = uri.getHost(); - - if ( Util.strpos(host, domain) ) { - - OWA.debug( 'Found social network: %s', domain); - - return domain; - } - } - } - - isRefererSearchEngine( uri ) { - - for ( var i = 0, n = this.organicSearchEngines.length; i < n; i++ ) { - - var domain = this.organicSearchEngines[i].d; - var query_param = this.organicSearchEngines[i].q - var host = uri.getHost(); - var term = uri.getQueryParam(query_param); - - if ( Util.strpos(host, domain) ) { - OWA.debug( 'Found search engine: %s with query param %s:, query term: %s', domain, query_param, term); - - return {d: domain, q: query_param, t: term }; - } - } - } + /** + * DEPRICATED. Functionality moved to server side. + */ addOrganicSearchEngine( domain, query_param, prepend) { - var engine = {d: domain, q: query_param}; - if (prepend) { - this.organicSearchEngines.unshift(engine); - } else { - this.organicSearchEngines.push(engine); - } + return; } addTransaction( order_id, order_source, total, tax, shipping, gateway, city, state, country ) { diff --git a/owa_coreAPI.php b/owa_coreAPI.php index dfa56d13..39e117ba 100644 --- a/owa_coreAPI.php +++ b/owa_coreAPI.php @@ -979,7 +979,7 @@ public static function logEvent( $event_type, $message = '') { return false; } - + // queue for later or process event straight away if ( owa_coreAPI::getSetting( 'base', 'queue_events' ) || owa_coreAPI::getSetting( 'base', 'queue_incoming_tracking_events' ) ) { @@ -1849,6 +1849,36 @@ public static function isIpAddressExcluded( $ip_address ) { } } } + + static function loadConf( $file_name, $filter_name = '' ) { + + $conf_file = OWA_CONF_DIR . $file_name; + + if ( file_exists( $conf_file ) ) { + + $conf = include( $conf_file); + } + + $sup_file = OWA_DATA_DIR . 
$file_name; + + if ( file_exists( $sup_file ) ) { + + $sup_conf = include( $sup_file ); + + if ( is_array( $sup_conf) ) { + + $conf = array_merge( $conf, $sup_conf ); + } + } + + // see generic filter name for filtering the final conf array + if ( ! $filter_name ) { + + $filter_name = 'conf.' . $file_name; + } + + return owa_coreAPI::filter( $filter_name, $conf ); + } /** * Attaches an event handler to the event queue diff --git a/owa_env.php b/owa_env.php index 83be4f38..81aee109 100644 --- a/owa_env.php +++ b/owa_env.php @@ -32,6 +32,7 @@ define('OWA_PATH', dirname(__FILE__)); } define('OWA_DIR', OWA_PATH . '/'); +define('OWA_DATA_DIR', OWA_DIR . 'owa-data/'); define('OWA_MODULES_DIR', OWA_DIR.'modules/'); define('OWA_BASE_DIR', OWA_PATH); // depricated define('OWA_BASE_CLASSES_DIR', OWA_DIR); //depricated diff --git a/owa_requestContainer.php b/owa_requestContainer.php index 725402ce..32fe1e04 100644 --- a/owa_requestContainer.php +++ b/owa_requestContainer.php @@ -278,7 +278,7 @@ function decodeRequestParams() { array_walk_recursive($v, array($this, 'arrayUrlDecode')); $params[$k] = $v; } else { - $params[$k] = urldecode($v); + $params[$k] = rawurldecode($v); } } @@ -293,7 +293,8 @@ function decodeRequestParams() { } function arrayUrlDecode(&$val, $index) { - urldecode($val); + + rawurldecode($val); } function getOwaCookie($name) {