Demo 0 | Demo 1 | Demo 2 | Demo 3 | Demo 4 | Demo 5 | Code | Matt Smith

Code

<?php
//Text Analysis Tools
//Author: Matt Smith
//Supports NGrams

/**
 * TextMiner: collects text, cleans it, splits it into word n-grams and
 * counts how often each n-gram occurs.
 *
 * Typical use:
 *   $tm = new TextMiner();
 *   $tm->addText($html);
 *   $tm->setN(2);
 *   $tm->process();
 *   $top = $tm->getTopNGrams(10);
 */
class TextMiner {

    public static $version = "2.0";
    private $text;        //raw (and, after process(), cleaned) text
    private $N;           //n-gram size
    private $unigrams;    //list of single words found in the text
    private $nGrams;      //list of every n-gram occurrence, in text order
    private $nGramCounts; //n-gram => occurrences, sorted descending
    private $processed;   //TRUE once process() has completed

    //Options
    public $removeStopWords;
    public $includeLowerNGrams;
    public $convertToLower;
    const stop_words_file = "stop_words.txt";
    const verbose = FALSE;

    /**
     * Creates an empty miner with default options (see clear()).
     */
    public function __construct(){
        $this->clear();
    }

    /**
     * Appends text to the internal buffer, preceded by a ' || ' separator.
     *
     * @param string $text
     */
    public function addText($text){
        $this->text .= ' || '.$text;
    }

    /**
     * Appends the contents of a file (anything file_get_contents() can read).
     *
     * @param string $filename
     * @return bool TRUE when the contents were added, FALSE when reading failed
     */
    public function addFile($filename){
        $contents = file_get_contents($filename);
        //Strict check: a file containing only "0" (or nothing) is still a successful read.
        if($contents === false) {
            return false;
        }
        $this->addText($contents);
        return true;
    }

    /**
     * Resets the text, all derived data and every option to its default.
     */
    public function clear(){
        $this->text = '';
        $this->N = 2; //default to bi-grams
        $this->unigrams = array();
        $this->nGrams = array();
        $this->nGramCounts = array();
        $this->processed = FALSE;
        $this->removeStopWords = TRUE;
        $this->includeLowerNGrams = FALSE;
        $this->convertToLower = FALSE;
    }

    /**
     * Cleans the text, identifies the n-grams and counts them.
     *
     * Derived data from an earlier run is discarded first, so calling this
     * again (for example after setN() or addText()) does not double count.
     */
    public function process(){
        $this->unigrams = array();
        $this->nGrams = array();
        $this->nGramCounts = array();

        $this->cleanText();
        $this->identifyNGrams();
        $this->countNGrams();
        $this->processed = TRUE;
    }

    /**
     * @param int $N n-gram size (1 = unigrams, 2 = bi-grams, ...)
     */
    public function setN($N){
        $this->N = $N;
    }

    /**
     * @return int current n-gram size
     */
    public function getN(){
        return $this->N;
    }

    /**
     * Replaces the whole text buffer.
     *
     * @param string $text
     */
    public function setText($text){
        $this->text = $text;
    }

    /**
     * @return string the text buffer (cleaned once process() has run)
     */
    public function getText(){
        return $this->text;
    }

    /**
     * Replaces the list of n-gram occurrences.
     *
     * @param array $nGrams
     */
    public function setNGrams($nGrams){
        $this->nGrams = $nGrams;
    }

    /**
     * Appends n-gram occurrences to the current list.
     *
     * @param array $nGrams
     */
    private function addNGrams($nGrams){
        foreach($nGrams as $nGram){
            $this->nGrams[] = $nGram;
        }
    }

    /**
     * @return array every n-gram occurrence, in text order
     */
    public function getNGrams(){
        return $this->nGrams;
    }

    /**
     * Stores the n-gram counts, sorted by count descending.
     *
     * @param array $nGramCounts n-gram => count
     */
    public function setNGramCounts($nGramCounts){
        arsort($nGramCounts);
        $this->nGramCounts = $nGramCounts;
    }

    /**
     * @return array|string n-gram => count, or the message
     *                      "Run process first." when process() has not run
     */
    public function getNGramCounts(){
        if(!$this->processed) return "Run process first.";
        return $this->nGramCounts;
    }

    /**
     * Returns the $n most frequent n-grams.
     *
     * @param int  $n        how many n-grams to return
     * @param bool $as_array TRUE for n-gram => count, FALSE for a ', ' joined string
     * @return array|string
     */
    public function getTopNGrams($n=10,$as_array=TRUE){
        //preserve_keys=TRUE keeps purely numeric n-grams (e.g. "2020") as keys.
        $results = array_slice($this->nGramCounts,0,$n,TRUE);
        if($as_array) {
            return $results;
        }
        return implode(', ',array_keys($results));
    }

    /**
     * Echoes a short HTML summary: the first 200 bytes of text and the n-gram total.
     */
    public function printSummary(){
        //The text may still be raw markup here, so escape it for the HTML context.
        $excerpt = self::escapeHtml(trim(substr($this->getText(),0,200)));
        echo "======================<br/>";
        echo "Text: <b>".$excerpt."...</b><br/>";
        echo "Total nGrams: <b>".count($this->getNGrams())."</b><br/>";
        echo "======================<br/>";
    }

    //PRIVATE METHODS

    /**
     * Reduces the text buffer to plain words: decodes entities, strips
     * scripts, styles and tags, then blanks out every character that is not
     * a letter, digit, hyphen, apostrophe or space.
     */
    private function cleanText() {
        //Applied in this order; every match is replaced by a single space.
        $patterns = array(
            //REMOVALS
            "'<script[^>]*?>.*?</script>'si", //Strip out Javascript
            "'<style[^>]*?>.*?</style>'si",   //Strip out Styles
            "'<[/!]*?[^<>]*?>'si",            //Strip out HTML tags
            //ACCEPT ONLY
            "/[^a-zA-Z0-9\-' ]/",             //only accept these characters
        );

        $text = self::toUtf8($this->getText());
        //Entities are decoded before tags are stripped, so entity-escaped
        //markup is removed as well. Flags are pinned because the defaults
        //changed in PHP 8.1 (&#039; was not decoded before).
        $text = html_entity_decode($text, ENT_QUOTES, 'UTF-8');
        if($this->convertToLower) {
            $text = strtolower($text);
        }
        foreach($patterns as $pattern){
            $result = preg_replace($pattern, " ", $text);
            //preg_replace() returns null on a PCRE error (e.g. backtrack
            //limit); keep the text and let the remaining patterns run.
            if($result !== null) {
                $text = $result;
            }
        }
        $this->setText($text);
    }

    /**
     * Returns $text as UTF-8, converting from ISO-8859-1 only when the input
     * is not already valid UTF-8.
     *
     * @param string $text
     * @return string
     */
    private static function toUtf8($text) {
        $text = (string)$text;
        //preg_match() with the /u modifier fails on invalid UTF-8 input.
        if($text === '' || preg_match('//u', $text) === 1) {
            return $text;
        }
        if(function_exists('mb_convert_encoding')) {
            return mb_convert_encoding($text, 'UTF-8', 'ISO-8859-1');
        }
        if(function_exists('utf8_encode')) {
            return utf8_encode($text);
        }
        return $text;
    }

    /**
     * Escapes a value for output inside HTML.
     *
     * @param mixed $value
     * @return string
     */
    private static function escapeHtml($value) {
        return htmlspecialchars((string)$value, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
    }

    /**
     * Builds every $N-word n-gram from the unigrams and appends them to the
     * n-gram list. When includeLowerNGrams is set, also adds the (N-1)-grams,
     * (N-2)-grams, ... down to unigrams.
     *
     * @param int|null $N n-gram size; null means "use the configured size"
     */
    private function identifyNGrams($N=null) {
        $N = ($N === null) ? (int)$this->N : (int)$N;

        if(count($this->unigrams) == 0) {
            $this->identifyUnigrams();
        }
        $numUnigrams = count($this->unigrams);

        if($N > 1){
            $nGrams = array();
            //Slide a window of $N consecutive words over the unigram list.
            for($start = 0; $start + $N <= $numUnigrams; $start++){
                $nGrams[] = implode(' ', array_slice($this->unigrams, $start, $N));
            }
        } else {
            $nGrams = $this->unigrams;
        }

        $this->addNGrams($nGrams);
        if($this->includeLowerNGrams && $N > 1) {
            $this->identifyNGrams($N - 1);
        }
    }

    /**
     * Splits the text on whitespace into unigrams, dropping stop words when
     * removeStopWords is set.
     */
    private function identifyUnigrams(){
        //NO_EMPTY: empty text must give zero words, not one empty word.
        $unigrams = preg_split('/\s+/', trim($this->getText()), -1, PREG_SPLIT_NO_EMPTY);
        if($unigrams === false) {
            $unigrams = array();
        }
        if($this->removeStopWords) {
            $this->unigrams = self::removeStopWords($unigrams);
        } else {
            $this->unigrams = $unigrams;
        }
    }

    /**
     * Counts the occurrences of each n-gram and stores the sorted result.
     */
    private function countNGrams() {
        $this->setNGramCounts(array_count_values($this->getNGrams()));
    }

    //STATIC METHODS - can be called without having an instance of TextMiner

    /**
     * removeStopWords: removes the stopwords from a referenced array.
     *
     * Stop words are read from self::stop_words_file (one per line) when that
     * file exists and is readable, otherwise a small built-in list is used.
     * Matching is case-insensitive; the surviving words keep their case.
     *
     * @param array $words list of words ([0] = w1, [1] = w2, etc.); modified in place
     * @return array the filtered, re-indexed list (same as $words afterwards)
     */
    public static function removeStopWords (&$words) {
        $numWordsIn = count($words);
        if(self::verbose) { echo "removedStopWords => wordcount (IN: ".$numWordsIn.") "; }

        $stopWords = null;
        if(file_exists(self::stop_words_file)) {
            $contents = file_get_contents(self::stop_words_file);
            if($contents !== false) {
                //trim() also drops the "\r" left behind by Windows line endings.
                $stopWords = array_map('trim', explode("\n", strtolower($contents)));
            }
        }
        if($stopWords === null) {
            $stopWords = array("","the","and","a","of","by","although","i","to","in","on","at","but","or","nor","for","-");
        }

        //Flip into a lookup table so each word costs one hash probe.
        $lookup = array_flip($stopWords);
        $kept = array();
        foreach($words as $word){
            if(!isset($lookup[strtolower((string)$word)])) {
                $kept[] = $word;
            }
        }
        $words = $kept;

        $numWordsOut = count($words);
        if(self::verbose) { echo " (OUT: ".$numWordsOut.") Removed: ".($numWordsIn-$numWordsOut)."<br/>"; }
        return $words;
    }

    /**
     * Stems every word and counts how many words share each stem.
     * Empty words and words starting with "#" (comments) are skipped.
     *
     * @param array $words      list of words
     * @param int   $minSupport unused here; kept for interface compatibility
     * @return array stem => number of words, sorted descending
     */
    public static function getStemCounts(&$words,$minSupport=5) {
        $collected = self::collectStems($words);
        $stems_ctr = $collected['counts'];
        arsort($stems_ctr);
        return $stems_ctr;
    }

    /**
     * Echoes an HTML table of every stem supported by at least $minSupport
     * words, together with those words and their count.
     *
     * @param array $words      list of words
     * @param int   $minSupport minimum number of words a stem needs to be listed
     */
    public static function outputStemTable(&$words,$minSupport=5){
        $collected = self::collectStems($words);
        $stems = $collected['words'];
        $stems_ctr = $collected['counts'];
        arsort($stems_ctr);

        echo "<table style='border: 1px solid #aaa;'><tr style='background: #aa0000; color: white; font-weight: bold;'><td>STEM</td><td>WORDS</td></tr>";
        foreach($stems_ctr as $stem=>$word_count) {
            if($word_count >= $minSupport) {
                echo "<tr><td style='background: #efefef; font-weight: bold;'>".self::escapeHtml($stem)."</td><td>".self::escapeHtml($stems[$stem])." (".$word_count.")</td></tr>";
            }
        }
        echo "</table>";
    }

    /**
     * Runs the Stemming class (from Stemming.php) over every word.
     *
     * @param array $words list of words
     * @return array array('words' => stem => ", " joined lower-cased words,
     *                     'counts' => stem => number of words), unsorted
     */
    private static function collectStems($words) {
        //Load once: a plain require would redeclare the class on a second call.
        if(!class_exists('Stemming', false)) {
            require_once("Stemming.php");
        }
        $stems = array();
        $stems_ctr = array();

        foreach($words as $word) {
            $word = (string)$word;
            if($word === '' || substr($word,0,1) == "#") { //"#" indicates a comment
                continue;
            }

            $string = strtolower($word); //string to be stemmed
            $stm = new Stemming();
            $stm->reset();
            $length = strlen($string);
            for($i=0; $i<$length; $i++) {
                $stm->add(substr($string,$i,1));
            }
            $stm->stem(0);
            $stem = $stm->toString(); //stem of the string

            if(self::verbose) {
                echo "<b>$string</b> => <b style='color: blue;'>$stem</b><br/>";
                if(!is_numeric($stem) && !is_string($stem)) echo "[".$stem."]";
            }
            if(array_key_exists($stem,$stems)) {
                $stems[$stem] .= ", ".$string;
                $stems_ctr[$stem]++;
            } else {
                $stems[$stem] = $string;
                $stems_ctr[$stem] = 1;
            }
        }
        return array('words' => $stems, 'counts' => $stems_ctr);
    }
}

// Debug helper: dumps a value with print_r() wrapped in <pre> tags.
// Only defined when no other printa() has been declared already.
if (function_exists('printa') === false) {
    function printa($array)
    {
        echo "<pre>", print_r($array, true), "</pre>";
    }
}


?>