diff --git a/.editorconfig b/.editorconfig index 5a1be3b..eca7ae8 100644 --- a/.editorconfig +++ b/.editorconfig @@ -1,15 +1,24 @@ root = true [*] +charset = utf-8 end_of_line = lf insert_final_newline = true - -[*.php] indent_style = space -indent_size = 4 -charset = utf-8 +indent_size = 2 trim_trailing_whitespace = true -[{composer.json,.travis.yml}] +[*.md] +trim_trailing_whitespace = false + +[{package.json,.babelrc}] indent_style = space indent_size = 2 + +[*.{js,scss,vue,yml}] +indent_style = space +indent_size = 2 + +[*.{php,json,conf}] +indent_style = space +indent_size = 4 diff --git a/.travis.yml b/.travis.yml index bd58130..547e2f1 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,6 +1,5 @@ language: php php: - - '7.0' - '7.1' - '7.2' - '7.3' diff --git a/README.md b/README.md index e17d279..231ca87 100644 --- a/README.md +++ b/README.md @@ -19,13 +19,13 @@ You can use this for categorizing any text content into any arbitrary set of **c ## Installing ``` -composer install niiknow/bayes +composer install Niiknow/bayes ``` ## Usage ```php -$classifier = new \niiknow\Bayes(); +$classifier = new \Niiknow\Bayes(); // teach it positive phrases @@ -51,7 +51,7 @@ $classifier->fromJson($stateJson); ## API -### `$classifier = new \niiknow\Bayes([options])` +### `$classifier = new \Niiknow\Bayes([options])` Returns an instance of a Naive-Bayes Classifier. diff --git a/composer.json b/composer.json index ff9d432..485c6eb 100755 --- a/composer.json +++ b/composer.json @@ -33,7 +33,7 @@ }, "autoload": { "psr-4": { - "niiknow\\": "src/" + "Niiknow\\": "src/" } }, "autoload-dev": { diff --git a/phpcs.xml b/phpcs.xml new file mode 100644 index 0000000..a0ff1a2 --- /dev/null +++ b/phpcs.xml @@ -0,0 +1,107 @@ + + + + Coding Standards + + + src + + + */tests/* + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/phpunit.xml.dist b/phpunit.xml similarity index 92% rename from phpunit.xml.dist rename to phpunit.xml index 67fe7f9..374ad17 100644 --- a/phpunit.xml.dist +++ b/phpunit.xml @@ -7,8 +7,7 @@ convertNoticesToExceptions="true" convertWarningsToExceptions="true" processIsolation="false" - stopOnFailure="true" - syntaxCheck="false"> + stopOnFailure="true"> ./tests/ diff --git a/src/Bayes.php b/src/Bayes.php index 0559a34..1efde56 100755 --- a/src/Bayes.php +++ b/src/Bayes.php @@ -1,6 +1,5 @@ options = $options; if (!$this->options) { $this->options = []; } // set default tokenizer - $this->tokenizer = function($text) { + $this->tokenizer = function ($text) use ($that) { // convert everything to lowercase - $text = strtolower($text); + $text = mb_strtolower($text); // split the words preg_match_all('/[[:alpha:]]+/u', $text, $matches); @@ -50,17 +50,19 @@ public function __construct($options = null) { }; if (isset($this->options['tokenizer'])) { - $this->tokenizer = $this->options['tokenizer']; + $this->tokenizer = $this->options['tokenizer']; } $this->reset(); } /** - * reset the bayes class + * Reset the bayes class + * * @return Bayes */ - public function reset() { + public function reset() + { // hashmap of our category names $this->categories = []; @@ -72,7 +74,7 @@ public function reset() { $this->totalDocuments = 0; // initialize our vocabulary and its size - $this->vocabulary = []; + $this->vocabulary = []; $this->vocabularySize = 0; // for each category, how many words total were mapped to it @@ -82,18 +84,17 @@ public function reset() { // => for each category, how frequent was a given word mapped to it $this->wordFrequencyCount = []; - // array of stopwords - $this->stopWords = []; - return $this; } /** - * deserialize from json + * Deserialize from json + * * @param object $json string or array * @return Bayes */ - public function fromJson($json) { + public function fromJson($json) + { $result = $json; // deserialize from json if (is_string($json)) { @@ -103,7 +104,7 @@ public function fromJson($json) { $this->reset(); // deserialize from json - foreach($this->STATE_KEYS as $k) { + foreach ($this->STATE_KEYS as $k) { if (isset($result[$k])) { $this->{$k} = $result[$k]; } @@ -113,14 +114,16 @@ public function fromJson($json) { } /** - * serialize to json + * Serialize to json + * * @return string the json string */ - public function toJson() { + public function toJson() + { $result = []; // serialize to json - foreach($this->STATE_KEYS as $k) { + foreach ($this->STATE_KEYS as $k) { $result[$k] = $this->{$k}; } @@ -128,16 +131,18 @@ public function toJson() { } /** - * make sure the category exists in dictionary + * Make sure the category exists in dictionary + * * @param string $categoryName * @return Bayes */ - public function initializeCategory($categoryName) { + public function initializeCategory($categoryName) + { if (!isset($this->categories[$categoryName])) { - $this->docCount[$categoryName] = 0; - $this->wordCount[$categoryName] = 0; + $this->docCount[$categoryName] = 0; + $this->wordCount[$categoryName] = 0; $this->wordFrequencyCount[$categoryName] = []; - $this->categories[$categoryName] = true; + $this->categories[$categoryName] = true; } return $this; @@ -145,11 +150,13 @@ public function initializeCategory($categoryName) { /** * Teach your classifier + * * @param string $text * @param string $category * @return Bayes */ - public function learn($text, $category) { + public function learn($text, $category) + { $self = $this; // initialize category data structures if we've never seen this category @@ -168,7 +175,7 @@ public function learn($text, $category) { $frequencyTable = $self->frequencyTable($tokens); // Update vocabulary and word frequency count for this category - foreach($frequencyTable as $token => $frequencyInText) { + foreach ($frequencyTable as $token => $frequencyInText) { // add this word to our vocabulary if not already existing if (!isset($self->vocabulary[$token])) { $self->vocabulary[$token] = true; @@ -177,10 +184,9 @@ public function learn($text, $category) { // update the frequency information for this word in this category if (!isset($self->wordFrequencyCount[$category][$token])) { - $self->wordFrequencyCount[$category][$token] = $frequencyInText; - } - else { - $self->wordFrequencyCount[$category][$token] += $frequencyInText; + $self->wordFrequencyCount[$category][$token] = $frequencyInText; + } else { + $self->wordFrequencyCount[$category][$token] += $frequencyInText; } // update the count of all words we have seen mapped to this category @@ -195,30 +201,31 @@ public function learn($text, $category) { * @param string $text * @return string the category or null */ - public function categorize($text) { - $self = $this; + public function categorize($text) + { + $self = $this; $maxProbability = -INF; $chosenCategory = null; if ($self->totalDocuments > 0) { - $tokens = ($self->tokenizer)($text); + $tokens = ($self->tokenizer)($text); $frequencyTable = $self->frequencyTable($tokens); - // iterate thru our categories to find the one with max probability for this text - foreach($self->categories as $category => $value) { + // iterate thru our categories to find the one with max probability + // for this text + foreach ($self->categories as $category => $value) { $categoryProbability = $self->docCount[$category] / $self->totalDocuments; - $logProbability = log($categoryProbability); - foreach($frequencyTable as $token => $frequencyInText) { + $logProbability = log($categoryProbability); + foreach ($frequencyTable as $token => $frequencyInText) { $tokenProbability = $self->tokenProbability($token, $category); - // console.log('token: %s category: `%s` tokenProbability: %d', token, category, tokenProbability) // determine the log of the P( w | c ) for this word $logProbability += $frequencyInText * log($tokenProbability); } if ($logProbability > $maxProbability) { - $maxProbability = $logProbability; - $chosenCategory = $category; + $maxProbability = $logProbability; + $chosenCategory = $category; } } } @@ -227,12 +234,14 @@ public function categorize($text) { } /** - * calculate the probability that a `token` belongs to a `category` + * Calculate the probability that a `token` belongs to a `category` + * * @param string $token * @param string $category * @return number the probability */ - public function tokenProbability($token, $category) { + public function tokenProbability($token, $category) + { // how many times this word has occurred in documents mapped to this category $wordFrequencyCount = 0; if (isset($this->wordFrequencyCount[$category][$token])) { @@ -254,14 +263,14 @@ public function tokenProbability($token, $category) { * @param array $tokens array of string * @return array hashmap of token frequency */ - public function frequencyTable($tokens) { + public function frequencyTable($tokens) + { $frequencyTable = []; // print(json_encode($tokens)); - foreach($tokens as $token) { + foreach ($tokens as $token) { if (!isset($frequencyTable[$token])) { $frequencyTable[$token] = 1; - } - else { + } else { $frequencyTable[$token]++; } } diff --git a/tests/BayesTests.php b/tests/BayesTests.php index 3d3b7a5..b87db34 100644 --- a/tests/BayesTests.php +++ b/tests/BayesTests.php @@ -1,6 +1,6 @@