diff --git a/.editorconfig b/.editorconfig
index 5a1be3b..eca7ae8 100644
--- a/.editorconfig
+++ b/.editorconfig
@@ -1,15 +1,24 @@
root = true
[*]
+charset = utf-8
end_of_line = lf
insert_final_newline = true
-
-[*.php]
indent_style = space
-indent_size = 4
-charset = utf-8
+indent_size = 2
trim_trailing_whitespace = true
-[{composer.json,.travis.yml}]
+[*.md]
+trim_trailing_whitespace = false
+
+[{package.json,.babelrc}]
indent_style = space
indent_size = 2
+
+[*.{js,scss,vue,yml}]
+indent_style = space
+indent_size = 2
+
+[*.{php,json,conf}]
+indent_style = space
+indent_size = 4
diff --git a/.travis.yml b/.travis.yml
index bd58130..547e2f1 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,6 +1,5 @@
language: php
php:
- - '7.0'
- '7.1'
- '7.2'
- '7.3'
diff --git a/README.md b/README.md
index e17d279..231ca87 100644
--- a/README.md
+++ b/README.md
@@ -19,13 +19,13 @@ You can use this for categorizing any text content into any arbitrary set of **c
## Installing
```
-composer install niiknow/bayes
+composer require niiknow/bayes
```
## Usage
```php
-$classifier = new \niiknow\Bayes();
+$classifier = new \Niiknow\Bayes();
// teach it positive phrases
@@ -51,7 +51,7 @@ $classifier->fromJson($stateJson);
## API
-### `$classifier = new \niiknow\Bayes([options])`
+### `$classifier = new \Niiknow\Bayes([options])`
Returns an instance of a Naive-Bayes Classifier.
diff --git a/composer.json b/composer.json
index ff9d432..485c6eb 100755
--- a/composer.json
+++ b/composer.json
@@ -33,7 +33,7 @@
},
"autoload": {
"psr-4": {
- "niiknow\\": "src/"
+ "Niiknow\\": "src/"
}
},
"autoload-dev": {
diff --git a/phpcs.xml b/phpcs.xml
new file mode 100644
index 0000000..a0ff1a2
--- /dev/null
+++ b/phpcs.xml
@@ -0,0 +1,107 @@
+
+
+
+ Coding Standards
+
+
+ src
+
+
+ */tests/*
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/phpunit.xml.dist b/phpunit.xml
similarity index 92%
rename from phpunit.xml.dist
rename to phpunit.xml
index 67fe7f9..374ad17 100644
--- a/phpunit.xml.dist
+++ b/phpunit.xml
@@ -7,8 +7,7 @@
convertNoticesToExceptions="true"
convertWarningsToExceptions="true"
processIsolation="false"
- stopOnFailure="true"
- syntaxCheck="false">
+ stopOnFailure="true">
./tests/
diff --git a/src/Bayes.php b/src/Bayes.php
index 0559a34..1efde56 100755
--- a/src/Bayes.php
+++ b/src/Bayes.php
@@ -1,6 +1,5 @@
options = $options;
if (!$this->options) {
$this->options = [];
}
// set default tokenizer
- $this->tokenizer = function($text) {
+ $this->tokenizer = function ($text) use ($that) {
// convert everything to lowercase
- $text = strtolower($text);
+ $text = mb_strtolower($text);
// split the words
preg_match_all('/[[:alpha:]]+/u', $text, $matches);
@@ -50,17 +50,19 @@ public function __construct($options = null) {
};
if (isset($this->options['tokenizer'])) {
- $this->tokenizer = $this->options['tokenizer'];
+ $this->tokenizer = $this->options['tokenizer'];
}
$this->reset();
}
/**
- * reset the bayes class
+ * Reset the bayes class
+ *
* @return Bayes
*/
- public function reset() {
+ public function reset()
+ {
// hashmap of our category names
$this->categories = [];
@@ -72,7 +74,7 @@ public function reset() {
$this->totalDocuments = 0;
// initialize our vocabulary and its size
- $this->vocabulary = [];
+ $this->vocabulary = [];
$this->vocabularySize = 0;
// for each category, how many words total were mapped to it
@@ -82,18 +84,17 @@ public function reset() {
// => for each category, how frequent was a given word mapped to it
$this->wordFrequencyCount = [];
- // array of stopwords
- $this->stopWords = [];
-
return $this;
}
/**
- * deserialize from json
+ * Deserialize from json
+ *
* @param object $json string or array
* @return Bayes
*/
- public function fromJson($json) {
+ public function fromJson($json)
+ {
$result = $json;
// deserialize from json
if (is_string($json)) {
@@ -103,7 +104,7 @@ public function fromJson($json) {
$this->reset();
// deserialize from json
- foreach($this->STATE_KEYS as $k) {
+ foreach ($this->STATE_KEYS as $k) {
if (isset($result[$k])) {
$this->{$k} = $result[$k];
}
@@ -113,14 +114,16 @@ public function fromJson($json) {
}
/**
- * serialize to json
+ * Serialize to json
+ *
* @return string the json string
*/
- public function toJson() {
+ public function toJson()
+ {
$result = [];
// serialize to json
- foreach($this->STATE_KEYS as $k) {
+ foreach ($this->STATE_KEYS as $k) {
$result[$k] = $this->{$k};
}
@@ -128,16 +131,18 @@ public function toJson() {
}
/**
- * make sure the category exists in dictionary
+ * Make sure the category exists in dictionary
+ *
* @param string $categoryName
* @return Bayes
*/
- public function initializeCategory($categoryName) {
+ public function initializeCategory($categoryName)
+ {
if (!isset($this->categories[$categoryName])) {
- $this->docCount[$categoryName] = 0;
- $this->wordCount[$categoryName] = 0;
+ $this->docCount[$categoryName] = 0;
+ $this->wordCount[$categoryName] = 0;
$this->wordFrequencyCount[$categoryName] = [];
- $this->categories[$categoryName] = true;
+ $this->categories[$categoryName] = true;
}
return $this;
@@ -145,11 +150,13 @@ public function initializeCategory($categoryName) {
/**
* Teach your classifier
+ *
* @param string $text
* @param string $category
* @return Bayes
*/
- public function learn($text, $category) {
+ public function learn($text, $category)
+ {
$self = $this;
// initialize category data structures if we've never seen this category
@@ -168,7 +175,7 @@ public function learn($text, $category) {
$frequencyTable = $self->frequencyTable($tokens);
// Update vocabulary and word frequency count for this category
- foreach($frequencyTable as $token => $frequencyInText) {
+ foreach ($frequencyTable as $token => $frequencyInText) {
// add this word to our vocabulary if not already existing
if (!isset($self->vocabulary[$token])) {
$self->vocabulary[$token] = true;
@@ -177,10 +184,9 @@ public function learn($text, $category) {
// update the frequency information for this word in this category
if (!isset($self->wordFrequencyCount[$category][$token])) {
- $self->wordFrequencyCount[$category][$token] = $frequencyInText;
- }
- else {
- $self->wordFrequencyCount[$category][$token] += $frequencyInText;
+ $self->wordFrequencyCount[$category][$token] = $frequencyInText;
+ } else {
+ $self->wordFrequencyCount[$category][$token] += $frequencyInText;
}
// update the count of all words we have seen mapped to this category
@@ -195,30 +201,31 @@ public function learn($text, $category) {
* @param string $text
* @return string the category or null
*/
- public function categorize($text) {
- $self = $this;
+ public function categorize($text)
+ {
+ $self = $this;
$maxProbability = -INF;
$chosenCategory = null;
if ($self->totalDocuments > 0) {
- $tokens = ($self->tokenizer)($text);
+ $tokens = ($self->tokenizer)($text);
$frequencyTable = $self->frequencyTable($tokens);
- // iterate thru our categories to find the one with max probability for this text
- foreach($self->categories as $category => $value) {
+ // iterate through our categories to find the one with max probability
+ // for this text
+ foreach ($self->categories as $category => $value) {
$categoryProbability = $self->docCount[$category] / $self->totalDocuments;
- $logProbability = log($categoryProbability);
- foreach($frequencyTable as $token => $frequencyInText) {
+ $logProbability = log($categoryProbability);
+ foreach ($frequencyTable as $token => $frequencyInText) {
$tokenProbability = $self->tokenProbability($token, $category);
- // console.log('token: %s category: `%s` tokenProbability: %d', token, category, tokenProbability)
// determine the log of the P( w | c ) for this word
$logProbability += $frequencyInText * log($tokenProbability);
}
if ($logProbability > $maxProbability) {
- $maxProbability = $logProbability;
- $chosenCategory = $category;
+ $maxProbability = $logProbability;
+ $chosenCategory = $category;
}
}
}
@@ -227,12 +234,14 @@ public function categorize($text) {
}
/**
- * calculate the probability that a `token` belongs to a `category`
+ * Calculate the probability that a `token` belongs to a `category`
+ *
* @param string $token
* @param string $category
* @return number the probability
*/
- public function tokenProbability($token, $category) {
+ public function tokenProbability($token, $category)
+ {
// how many times this word has occurred in documents mapped to this category
$wordFrequencyCount = 0;
if (isset($this->wordFrequencyCount[$category][$token])) {
@@ -254,14 +263,14 @@ public function tokenProbability($token, $category) {
* @param array $tokens array of string
* @return array hashmap of token frequency
*/
- public function frequencyTable($tokens) {
+ public function frequencyTable($tokens)
+ {
$frequencyTable = [];
// print(json_encode($tokens));
- foreach($tokens as $token) {
+ foreach ($tokens as $token) {
if (!isset($frequencyTable[$token])) {
$frequencyTable[$token] = 1;
- }
- else {
+ } else {
$frequencyTable[$token]++;
}
}
diff --git a/tests/BayesTests.php b/tests/BayesTests.php
index 3d3b7a5..b87db34 100644
--- a/tests/BayesTests.php
+++ b/tests/BayesTests.php
@@ -1,6 +1,6 @@