diff --git a/modules/base/entities/referer.php b/modules/base/entities/referer.php
index 7a3201fc..1838ac88 100644
--- a/modules/base/entities/referer.php
+++ b/modules/base/entities/referer.php
@@ -56,53 +56,46 @@ function __construct() {
 		$this->properties['is_searchengine']->setDataType(OWA_DTD_TINYINT);
 	}
 
-	public function crawlReferer()
-	{
-		if (!owa_coreAPI::getSetting( 'base', 'fetch_refering_page_info')) {
-			return;
-		}
-
-		$crawler = new owa_http;
-		$res = $crawler->getRequest($this->get('url'));
-
-		$title = trim($crawler->extract_title());
-
-		if ($title) {
-			$this->set('page_title', owa_lib::utf8Encode($title));
-		}
-
-		$se = $this->get('is_searchengine');
-
-		if ($se == true) {
-			return;
-		}
-
-		$medium = $this->get('medium');
-
-		if ( $medium === 'organic-search' || $medium === 'social-network' ) {
+	/**
+	 * Fetches the referring page and records its title and anchor text.
+	 *
+	 * Does nothing unless the base 'fetch_refering_page_info' setting is
+	 * enabled, and never crawls search engines or social networks.
+	 */
+	public function crawlReferer() {
+
+		if ( owa_coreAPI::getSetting( 'base', 'fetch_refering_page_info')) {
+
+			// never crawl search engines or social networks.
+			$medium = $this->get('medium');
 
-			return;
-		}
-
-		//Extract anchortext and page snippet but not if it's a search engine...
-		$snippet = $crawler->extract_anchor_snippet($this->get('url'));
-
-		if ($snippet) {
-			if (function_exists('iconv')) {
-				$snippet = iconv('UTF-8','UTF-8//TRANSLIT',$snippet);
-			}
-			$this->set('snippet', $snippet);
-		}
-
-		if (!$crawler->anchor_info) {
-			return;
-		}
-
-		$anchortext = $crawler->anchor_info['anchor_text'];
-
-		if ($anchortext) {
-			$this->set('refering_anchortext', owa_lib::utf8Encode($anchortext));
-		}
+			if ( $medium === 'organic-search' || $this->get('is_searchengine') || $medium === 'social-network' ) {
+
+				return;
+			}
+
+			$crawler = new owa_http;
+			$crawler->getRequest($this->get('url'));
+
+			// extract title
+			$title = $crawler->extract_title();
+
+			if ($title) {
+
+				$this->set( 'page_title', owa_lib::utf8Encode( $title ) );
+			}
+
+			//Extract anchortext and page snippet but not if it's a search engine...
+			$anchortext = $crawler->extractAnchorText( $this->get('url') );
+
+			if ( $anchortext ) {
+
+				$anchortext = owa_lib::utf8Encode($anchortext );
+
+				$this->set( 'snippet', $anchortext );
+				$this->set( 'refering_anchortext', $anchortext );
+			}
+		}
 	}
 }
 
diff --git a/owa_httpRequest.php b/owa_httpRequest.php
index e221a471..953fc2cd 100644
--- a/owa_httpRequest.php
+++ b/owa_httpRequest.php
@@ -94,136 +94,57 @@ function __construct() {
 	 *
-	 * @param string $link
+	 * @return array preg_match_all() matches in PREG_SET_ORDER form; empty when no anchors are found
 	 */
-	function extract_anchor($link) {
+	function extractAnchors() {
 
 		$regex = '/]*href\s*=\s*([\"\']??)(http|https[^\\1 >]*?)\\1[^>]*>s*(.*)<\/a>/simU';
 
 		if( preg_match_all("$regex", $this->getResponseBody(), $matches, PREG_SET_ORDER ) ) {
 
-			owa_coreAPI::debug( 'TEST Found anchor info: ' . print_r( $matches, true ) );
+			owa_coreAPI::debug( 'Found anchors: ' . print_r( $matches, true ) );
 
-			foreach($matches as $match) {
-				// $match[2] = link address
-				// $match[3] = link text
-
-				if ($match[3] && $link === $match[3] ) {
-					$this->anchor_info = array('anchor_tag' => match[0], 'anchor_text' => owa_lib::inputFilter( owa_sanitize::stripAllTags( $match[3] ) ));
-					owa_coreAPI::debug('Anchor info: '.print_r($this->anchor_info, true));
-				}
-			}
+			return $matches;
 		}
-
-		return;
-
-		$matches = '';
-		$regex = '/]*href=\"%s\"[^>]*>(.*?)<\/a>/i';
-
-		//$escaped_link = str_replace(array("/", "?"), array("\/", "\?"), $link);
-
-		$pattern = trim(sprintf($regex, preg_quote($link, '/')));
-		$search = preg_match($pattern, $this->getResponseBody(), $matches);
-		//owa_coreAPI::debug('pattern: '.$pattern);
-		//owa_coreAPI::debug('link: '.$link);
-
-
-		if (empty($matches)) {
-			if (substr($link, -1) === '/') {
-				$link = substr($link, 0, -1);
-				$pattern = trim(sprintf($regex, preg_quote($link, '/')));
-				$search = preg_match($pattern, $this->getResponseBody(), $matches);
-				//owa_coreAPI::debug('pattern: '.$pattern);
-				//owa_coreAPI::debug('link: '.$link);
-			}
-		}
-
-		owa_coreAPI::debug('ref search: '.$search);
-		//owa_coreAPI::debug('ref matches: '.print_r($this->results, true));
-		//owa_coreAPI::debug('ref matches: '.print_r($matches, true));
-		if (isset($matches[0])) {
-			$this->anchor_info = array('anchor_tag' => $matches[0], 'anchor_text' => owa_lib::inputFilter($matches[0]));
-			owa_coreAPI::debug('Anchor info: '.print_r($this->anchor_info, true));
-		}
+
+		return array();
 	}
-
-	/**
-	 * Creates a text snippet of the portion of page where the
-	 * specific link is found.
-	 *
-	 * Takes fully qualified URL for the link to search for.
-	 *
-	 * @param string $link
-	 * @return string
-	 */
-	function extract_anchor_snippet($link){
-
-		// Search the page for a specific anchor
-		$this->extract_anchor($link);
-
-		if (array_key_exists( 'anchor_text', $this->anchor_info ) ) {
+
+	/**
+	 * Returns the text of the first anchor in the response body whose link
+	 * address is exactly $url and whose text is non-empty once tags are stripped.
+	 *
+	 * @param string $url link address to look for
+	 * @return string filtered anchor text, or '' when no anchor matches
+	 */
+	function extractAnchorText( $url ) {
+
+		$anchors = $this->extractAnchors();
+
+		$anchortext = '';
+
+		foreach( $anchors as $match) {
+			// $match[0] = full matching tag
+			// $match[2] = link address
+			// $match[3] = link text
 
-			return $this->anchor_info['anchor_text'];
-		} else {
+			//strip any HTML tags (i.e. img, span, etc)
+			if ( $match[3] ) {
+
+				$match[3] = trim( owa_sanitize::stripAllTags( $match[3] ) );
+			}
 
-			return;
-		}
-
-		if(!empty($this->anchor_info['anchor_tag'])) {
-
-			// drop certain HTML entitities and their content
-			$nohtml = $this->strip_selected_tags(
-				$this->getResponseBody(),
-				array('title',
-					'head',
-					'script',
-					'object',
-					'style',
-					'meta',
-					'link',
-					'rdf:'),
-				true);
-
-			//owa_coreAPI::debug('Refering page content after certain html entities were dropped: '.$this->results);
-
-			// calc len of the anchor text
-			$atext_len = strlen($this->anchor_info['anchor_tag']);
-
-			// find position within document of the anchor text
-			$start = strpos($nohtml, $this->anchor_info['anchor_tag']);
-
-			if ($start < $this->snip_len) {
-				$part1_start_pos = 0;
-				$part1_snip_len = $start;
-			} else {
-				$part1_start_pos = $start;
-				$part1_snip_len = $this->snip_len;
-			}
-
-			$replace_items = array("\r\n", "\n\n", "\t", "\r", "\n");
-			// Create first segment of snippet
-			$first_part = substr($nohtml, 0, $part1_start_pos);
-			$first_part = str_replace($replace_items, '', $first_part);
-			$first_part = strip_tags(owa_lib::inputFilter($first_part));
-			//$part1 = trim(substr($nohtml, $part1_start_pos, $part1_snip_len));
-			$part1 = substr($first_part,-$part1_snip_len, $part1_snip_len);
-
-			//$part1 = str_replace(array('\r\n', '\n\n', '\t', '\r', '\n'), '', $part1);
-			//$part1 = owa_lib::inputFilter($part1);
-			// Create second segment of snippet
-			$part2 = trim(substr($nohtml, $start + $atext_len, $this->snip_len+300));
-			$part2 = str_replace($replace_items, '', $part2);
-			$part2 = substr(strip_tags(owa_lib::inputFilter($part2)),0, $this->snip_len);
-
-			// Put humpty dumpy back together again and create actual snippet
-			$snippet = $this->snip_str.$part1.' '.owa_lib::inputFilter($this->anchor_info['anchor_tag']).' '.$part2.$this->snip_str;
-
-		} else {
-
-			$snippet = '';
-
-		}
-
-		return $snippet;
-
+			// if anything is left as anchortext then use that
+			if ( $match[3] && $url === $match[2] ) {
+
+				$anchortext = owa_lib::inputFilter( $match[3] );
+
+				owa_coreAPI::debug( 'Anchor text: ' . $anchortext );
+
+				return $anchortext;
+			}
+		}
+
+		// no anchor on the page links to $url
+		return $anchortext;
 	}
 
 	function extract_title() {
@@ -238,7 +159,7 @@ function extract_title() {
 
 		owa_coreAPI::debug("referrer title extract: ". print_r($title, true));
 
-		return $title;
+		return trim($title);
 	}
 
 	function strip_selected_tags($str, $tags = array(), $stripContent = false) {