Binary Bayesian Filter - PHP

bayesian machine learning nlp php

This is a simple binary Bayesian filter. The interfaces and abstract classes exist because I am still midway through porting it from Java to PHP :)

Also note that this model computes $P(A \mid B)$ by assigning probability values rather than raw frequency counts:

$$P(A \mid B) = \frac{P(A \cap B)}{P(B)}$$

Sample Usage

// Load the two training corpora.
// File::read() and LIB_DIR are the author's own helpers; substitute your own loader.
$neg = File::read(LIB_DIR . 'classify/bayes/data/NEG');
$pos = File::read(LIB_DIR . 'classify/bayes/data/POS');

// Train on both corpora, then turn the collected ratios into per-word probabilities.
$bayes = new PositiveNegativeBayesianFilter();
$bayes->trainBad($neg);
$bayes->trainGood($pos);
$bayes->finalizeTraining();

$reviews = array(
    'I hate this stupid website',
    'I love life, life is awesomme',
    'blah blah blah'
);

// Live sentiment analysis: print one "<text> = <score>" line per review.
foreach ($reviews as $review) {
    $score = $bayes->analyze($review);
    echo $review . ' = ' . $score . '<br/>';
}

PositiveNegativeBayesianFilter.php

<?php

require_once("AbstractBayesianBinaryFilter.php");

/**
 * Two-class (negative / positive) Bayesian text filter.
 *
 * Content is split into tokens with a configurable regex; every token is
 * lower-cased and stripped of non-alphanumeric characters, and per-word
 * occurrence ratios are accumulated in the inherited $words map.
 * analyze() then combines the most "interesting" words of a text using the
 * formula from Paul Graham's "A Plan for Spam".
 *
 * Typical use: call trainBad() / trainGood() (each may be called more than
 * once), then finalizeTraining(), then analyze().
 *
 * @author Kenny
 */
class PositiveNegativeBayesianFilter extends AbstractBayesianBinaryFilter {

    // Weight applied to "good" ratios; mirrors the doubling of good counts
    // in Graham's essay, which biases the filter against false negatives.
    const GOOD_BIAS = 2.0;

    // Negative probability assigned to a word that was never seen in training.
    const UNKNOWN_P_NEGATIVE = 0.4;

    // How many of the most interesting words analyze() takes into account.
    const INTERESTING_LIMIT = 15;

    // Regex used to split a string into tokens
    private $splitregex;

    // Running number of usable tokens seen by trainBad() across all calls
    private $badTotal = 0;

    // Running number of usable tokens seen by trainGood() across all calls
    private $goodTotal = 0;

    public function __construct() {
        $this->splitregex = '/\w+/i';
    }

    /**
     * Splits content into raw tokens using the configured split regex.
     *
     * @param string $content text to tokenise
     * @return array list of matched tokens; empty when nothing matches or
     *               when the regex fails to execute
     */
    public function parseTokens($content) {
        $matches = array();
        $found = preg_match_all($this->splitregex, $content, $matches);
        // preg_match_all() returns false on failure (e.g. an invalid
        // user-supplied pattern); do not index into $matches in that case.
        if ($found === false || !isset($matches[0])) {
            return array();
        }
        return $matches[0];
    }

    /**
     * Replaces the regex used by parseTokens().
     *
     * @param string $splitregex a PCRE pattern including delimiters
     */
    public function setSplitRegex($splitregex) {
        $this->splitregex = $splitregex;
    }

    /**
     * Learns from content that is known to be negative.
     *
     * May be called repeatedly: ratios are always computed against the
     * total number of negative tokens seen so far, not only this call's.
     *
     * @param string $content negative training text
     */
    public function trainBad($content) {
        foreach ($this->normalizedWords($content) as $word) {
            $this->badTotal++;
            $this->registerWord($word)->countBad();
        }
        // Refresh the ratio of every known word against the running total.
        foreach ($this->words as $w) {
            $w->calcBadProb($this->badTotal);
        }
    }

    /**
     * Learns from content that is known to be positive.
     *
     * May be called repeatedly: ratios are always computed against the
     * total number of positive tokens seen so far, not only this call's.
     *
     * @param string $content positive training text
     */
    public function trainGood($content) {
        foreach ($this->normalizedWords($content) as $word) {
            $this->goodTotal++;
            $this->registerWord($word)->countGood();
        }
        // Refresh the ratio of every known word against the running total.
        foreach ($this->words as $w) {
            $w->calcGoodProb($this->goodTotal, self::GOOD_BIAS);
        }
    }

    /**
     * Scores a piece of content.
     *
     * This method is derived from Paul Graham:
     * http://www.paulgraham.com/spam.html
     *
     * @param string $content text to score
     * @return float combined probability that the content is positive;
     *               0.5 when the content contains no usable words
     */
    public function analyze($content) {
        $tokens = $this->parseTokens($content);
        $interesting = $this->getInterestingWords($tokens, self::INTERESTING_LIMIT);

        // Product of the negative probabilities and of their complements.
        $negativeProduct = 1.0;
        $positiveProduct = 1.0;
        foreach ($interesting as $w) {
            $pNegative = $w->getPNegative();
            $negativeProduct *= $pNegative;
            $positiveProduct *= (1.0 - $pNegative);
        }

        $denominator = $negativeProduct + $positiveProduct;
        if ($denominator == 0.0) {
            // Both products collapsed to zero: no evidence either way.
            return 0.5;
        }
        return $positiveProduct / $denominator;
    }

    /**
     * Builds the list of the $limit most "interesting" distinct words.
     *
     * A word is more interesting the further its negative probability is
     * from 0.5. The result is ordered from most to least interesting; words
     * of equal interest keep the order in which they first appeared.
     *
     * @param array $tokens raw tokens as returned by parseTokens()
     * @param int   $limit  maximum number of words to return
     * @return array list of Word objects
     */
    private function getInterestingWords($tokens, $limit) {
        $interesting = array();

        foreach ($tokens as $token) {
            $word = $this->normalizeToken($token);
            if ($word === '') {
                continue;
            }

            if (array_key_exists($word, $this->words)) {
                $candidate = $this->words[$word];
            } else {
                // Unknown word: use a throw-away Word with the default
                // negative probability instead of polluting the model.
                $candidate = new Word($word);
                $candidate->setPNegative(self::UNKNOWN_P_NEGATIVE);
            }

            // Find where the candidate belongs; default is the end of the list.
            $position = count($interesting);
            $alreadyListed = false;
            foreach ($interesting as $index => $listed) {
                // Strict comparison: with == the numeric-looking tokens
                // "10" and "010" would be treated as the same word.
                if ($candidate->getWord() === $listed->getWord()) {
                    $alreadyListed = true;
                    break;
                }
                if ($candidate->interesting() > $listed->interesting()) {
                    $position = $index;
                    break;
                }
            }
            if ($alreadyListed) {
                continue;
            }

            array_splice($interesting, $position, 0, array($candidate));

            // The least interesting entries sit at the end; drop the overflow.
            while (count($interesting) > $limit) {
                array_pop($interesting);
            }
        }

        return $interesting;
    }

    /**
     * Lower-cases a raw token and strips every character that is not an
     * ASCII letter, digit, whitespace or '+'.
     *
     * @param string $token raw token
     * @return string normalised word, possibly empty
     */
    private function normalizeToken($token) {
        // preg_replace() returns null on error; the cast turns that into ''.
        return (string) preg_replace('/[^A-Za-z0-9\s\s+]/', '', strtolower($token));
    }

    /**
     * Tokenises content and returns the non-empty normalised words.
     *
     * @param string $content text to tokenise
     * @return array list of normalised words, duplicates included
     */
    private function normalizedWords($content) {
        $words = array();
        foreach ($this->parseTokens($content) as $token) {
            $word = $this->normalizeToken($token);
            if ($word !== '') {
                $words[] = $word;
            }
        }
        return $words;
    }

    /**
     * Returns the Word stored for $word, creating and storing it first if
     * it is not known yet.
     *
     * @param string $word normalised word
     * @return Word
     */
    private function registerWord($word) {
        if (!array_key_exists($word, $this->words)) {
            $this->words[$word] = new Word($word);
        }
        return $this->words[$word];
    }

}

AbstractBayesianBinaryFilter.php

<?php

require_once("IBayesianBinaryFilter.php");
require_once("Word.php");
/**
 * Shared base for binary Bayesian filters: owns the word map and the
 * operations that only need that map (statistics output and turning the
 * collected ratios into final probabilities).
 *
 * @author Kenny
 */
abstract class AbstractBayesianBinaryFilter implements IBayesianBinaryFilter {

    // Map of word string => Word
    protected $words = array();

    /**
     * Echoes every known word followed by a space and its negative
     * probability. No separator is emitted between entries.
     */
    public function displayStats() {
        foreach ($this->words as $k => $word) {
            if ($word !== null) {
                // Word exposes getPNegative(); there is no getNegative().
                echo $k . " " . $word->getPNegative();
            }
        }
    }

    /**
     * Asks every known word to compute its final probabilities from the
     * ratios gathered during training.
     */
    public function finalizeTraining() {
        foreach ($this->words as $word) {
            $word->finalizeProb();
        }
    }

}

IBayesianBinaryFilter.php

<?php

require_once("IBayesianFilter.php");
/**
 * Contract for a Bayesian filter that is trained on exactly two classes of
 * content: "bad" and "good".
 *
 * @author destructo
 */
interface IBayesianBinaryFilter extends IBayesianFilter {

    /**
     * Trains the filter with content belonging to the "bad" class.
     *
     * @param mixed $content training content
     */
    public function trainBad($content);

    /**
     * Trains the filter with content belonging to the "good" class.
     *
     * @param mixed $content training content
     */
    public function trainGood($content);

    /**
     * Breaks content into the tokens the filter works with.
     *
     * @param mixed $content content to tokenise
     */
    public function parseTokens($content);
}

IBayesianFilter.php

<?php

/**
 * Minimal contract shared by all Bayesian filters.
 *
 * @author Kenny
 */
interface IBayesianFilter {

    /**
     * Completes the training phase; call once training input has been supplied.
     */
    public function finalizeTraining();

    /**
     * Evaluates a piece of content; the meaning of the result is defined
     * by the implementing class.
     *
     * @param mixed $content content to evaluate
     */
    public function analyze($content);

    /**
     * Outputs statistics about the filter's current state.
     */
    public function displayStats();

}

IBayesianFilter.php

<?php

/**
 * Minimal contract shared by all Bayesian filters.
 *
 * @author Kenny
 */
interface IBayesianFilter {

    /**
     * Completes the training phase; call once training input has been supplied.
     */
    public function finalizeTraining();

    /**
     * Evaluates a piece of content; the meaning of the result is defined
     * by the implementing class.
     *
     * @param mixed $content content to evaluate
     */
    public function analyze($content);

    /**
     * Outputs statistics about the filter's current state.
     */
    public function displayStats();

}

Word.php

<?php

/**
 * Per-word statistics for the Bayesian filter: occurrence counters for the
 * "bad" and "good" corpora, the ratios derived from them, and the final
 * negative / positive probabilities.
 *
 * @author Kenny
 */
class Word {

    private $word;            // the word itself
    private $countBad = 0;    // occurrences in "bad" content
    private $countGood = 0;   // occurrences in "good" content
    private $rBad = 0.0;      // bad count / total bad words (times bias)
    private $rGood = 0.0;     // good count / total good words (times bias)
    private $pNeg = 0.0;      // probability this word is negative
    private $pGood = 0.0;     // probability this word is positive

    /**
     * Creates a word whose counters, ratios and probabilities all start at zero.
     *
     * @param string $word the word this object tracks
     */
    public function __construct($word) {
        $this->word = $word;
    }

    /** Records one more occurrence in "bad" content. */
    public function countBad() {
        $this->countBad++;
    }

    /** Records one more occurrence in "good" content. */
    public function countGood() {
        $this->countGood++;
    }

    /**
     * Recomputes the "bad" ratio as bias * bad count / total.
     * The stored ratio is left untouched unless $total is greater than zero.
     *
     * @param int   $total   total number of bad words
     * @param float $biasBad multiplier applied to the ratio
     */
    public function calcBadProb($total, $biasBad = 1.0) {
        if (!($total > 0)) {
            return;
        }
        $this->rBad = $biasBad * $this->countBad / $total;
    }

    /**
     * Recomputes the "good" ratio as bias * good count / total.
     * The stored ratio is left untouched unless $total is greater than zero.
     *
     * @param int   $total    total number of good words
     * @param float $biasGood multiplier applied to the ratio
     */
    public function calcGoodProb($total, $biasGood = 1.0) {
        if (!($total > 0)) {
            return;
        }
        $this->rGood = $biasGood * $this->countGood / $total;
    }

    /**
     * Derives the negative and positive probabilities from the two ratios
     * (each ratio divided by their sum, when that sum is positive), then
     * forces both into the range 0.01 .. 0.99.
     */
    public function finalizeProb() {
        $sum = $this->rBad + $this->rGood;
        if ($sum > 0) {
            $this->pNeg = $this->rBad / $sum;
            $this->pGood = $this->rGood / $sum;
        }
        $this->pNeg = self::clampProbability($this->pNeg);
        $this->pGood = self::clampProbability($this->pGood);
    }

    /**
     * How "interesting" the word is: the distance of its negative
     * probability from 0.5.
     *
     * @return float
     */
    public function interesting() {
        return abs(0.5 - $this->pNeg);
    }

    /** @return float the "good" ratio (good count / total good words) */
    public function getPGood() {
        return $this->rGood;
    }

    /** @return float the "bad" ratio (bad count / total bad words) */
    public function getPBad() {
        return $this->rBad;
    }

    /** @return float probability that this word is negative */
    public function getPNegative() {
        return $this->pNeg;
    }

    /** @param float $f new negative probability */
    public function setPNegative($f) {
        $this->pNeg = $f;
    }

    /** @return float probability that this word is positive */
    public function getPPositive() {
        return $this->pGood;
    }

    /** @param float $f new positive probability */
    public function setPPositive($f) {
        $this->pGood = $f;
    }

    /** @return string the word itself */
    public function getWord() {
        return $this->word;
    }

    /**
     * Limits a probability to the range 0.01 .. 0.99.
     *
     * @param float $p
     * @return float
     */
    private static function clampProbability($p) {
        return $p < 0.01 ? 0.01 : ($p > 0.99 ? 0.99 : $p);
    }

}