Demo 0 | Demo 1 | Demo 2 | Demo 3 | Demo 4 | Demo 5 | Code | Matt Smith

Code

<?php
//Text Analysis Tools
//Author: Matt Smith

/**
 * Minimal keyword extractor for plain text or HTML.
 *
 * Typical use: add text with addText()/addFile(), call process(), then read
 * the results with getKeywords(), getKeywordCounts() or getTopKeywords().
 *
 * process() strips <script>/<style> blocks and HTML tags, lower-cases the
 * text, replaces every character outside [a-zA-Z0-9-' ] with a space, splits
 * on whitespace and (optionally) removes stop words before counting.
 */
class TextMiner {

    /** @var string Accumulated text; replaced by the cleaned text once process() has run. */
    private $text;

    /** @var array Keywords produced by the last process() run (array keys may be sparse). */
    private $keywords;

    /** @var array Map of keyword => occurrence count, sorted by count descending. */
    private $keywordCounts;

    /** @var bool TRUE once process() has completed since the last clear(). */
    private $processed;

    /** @var bool When TRUE, process() filters the keywords through removeStopWords(). */
    public $removeStopWords;

    /** Path of the optional stop-word list (one word per line), resolved by PHP's normal relative-path rules. */
    const stop_words_file = "stop_words.txt";

    /** When TRUE the static helpers echo diagnostic HTML. */
    const verbose = FALSE;

    /**
     * Creates an empty miner with stop-word removal enabled.
     */
    public function __construct() {
        $this->clear();
    }

    /**
     * Appends text to the internal buffer.
     *
     * A ' || ' separator is prepended so that words from consecutive additions
     * never run together; the '|' characters are discarded by cleanText().
     *
     * @param string $text Text (plain or HTML) to append.
     */
    public function addText($text) {
        $this->text .= ' || ' . $text;
    }

    /**
     * Reads a file and appends its contents to the internal buffer.
     *
     * @param string $filename Anything file_get_contents() accepts.
     * @return false|null FALSE when the file could not be read; otherwise the
     *                    (empty) result of addText().
     */
    public function addFile($filename) {
        $text = file_get_contents($filename);
        // Strict comparison: an empty file or one containing "0" is a valid
        // read and must not be mistaken for a failure.
        if ($text === false) {
            return false;
        }
        return $this->addText($text);
    }

    /**
     * Resets the miner: empties text and results, clears the processed flag
     * and re-enables stop-word removal.
     */
    public function clear() {
        $this->text = '';
        $this->keywords = array();
        $this->keywordCounts = array();
        $this->processed = FALSE;
        $this->removeStopWords = TRUE;
    }

    /**
     * Runs the full pipeline: clean the text, extract keywords, count them.
     *
     * @throws RuntimeException When the regular-expression clean-up fails.
     */
    public function process() {
        $this->cleanText();
        $this->identifyKeywords();
        $this->countKeywords();
        $this->processed = TRUE;
    }

    /**
     * Replaces the internal text buffer.
     *
     * @param string $text
     */
    public function setText($text) {
        $this->text = $text;
    }

    /**
     * @return string The current text buffer (cleaned text after process()).
     */
    public function getText() {
        return $this->text;
    }

    /**
     * Replaces the keyword list.
     *
     * @param array $keywords
     */
    public function setKeywords($keywords) {
        $this->keywords = $keywords;
    }

    /**
     * @return array The keyword list.
     */
    public function getKeywords() {
        return $this->keywords;
    }

    /**
     * Stores the keyword counts, sorted by count in descending order.
     *
     * @param array $keywordCounts Map of keyword => count.
     */
    public function setKeywordCounts($keywordCounts) {
        arsort($keywordCounts);
        $this->keywordCounts = $keywordCounts;
    }

    /**
     * @return array|string Map of keyword => count, or the literal string
     *                      "Run process first." when process() has not run yet.
     */
    public function getKeywordCounts() {
        if (!$this->processed) {
            return "Run process first.";
        }
        return $this->keywordCounts;
    }

    /**
     * Returns the $n most frequent keywords.
     *
     * @param int  $n        Maximum number of keywords to return.
     * @param bool $as_array TRUE for a keyword => count map, FALSE for a
     *                       comma-separated string of the keywords only.
     * @return array|string
     */
    public function getTopKeywords($n = 10, $as_array = TRUE) {
        $result = array_slice($this->keywordCounts, 0, $n, TRUE);
        if ($as_array) {
            return $result;
        }
        return implode(', ', array_keys($result));
    }

    /**
     * Echoes a short HTML summary: the first 200 bytes of the text and the
     * number of keywords.
     */
    public function printSummary() {
        // The buffer may still hold raw, unprocessed HTML; escape it so the
        // excerpt is displayed as text instead of being interpreted as markup.
        $excerpt = htmlspecialchars(
            trim(substr($this->getText(), 0, 200)),
            ENT_QUOTES | ENT_SUBSTITUTE,
            'UTF-8'
        );
        echo "======================<br/>";
        echo "Text: <b>" . $excerpt . "...</b><br/>";
        echo "Total Keywords: <b>" . count($this->getKeywords()) . "</b><br/>";
        echo "======================<br/>";
    }


    //PRIVATE METHODS

    /**
     * Normalises the text buffer in place: converts to UTF-8, decodes HTML
     * entities, lower-cases, then applies the removal patterns in order.
     *
     * @throws RuntimeException When preg_replace() reports an error.
     */
    private function cleanText() {
        // Applied in array order; every match becomes a single space.
        $searchReplace = array(
            //REMOVALS
            "'<script[^>]*?>.*?</script>'si" => " ", //Strip out Javascript
            "'<style[^>]*?>.*?</style>'si" => " ",   //Strip out Styles
            "'<[/!]*?[^<>]*?>'si" => " ",            //Strip out HTML tags
            //ACCEPT ONLY
            "/[^a-zA-Z0-9\-' ]/" => " ",             //only accept these characters
        );

        $text = $this->getText();

        // utf8_encode() is deprecated as of PHP 8.2; mb_convert_encoding()
        // from ISO-8859-1 performs the same conversion when mbstring is loaded.
        if (function_exists('mb_convert_encoding')) {
            $text = mb_convert_encoding($text, 'UTF-8', 'ISO-8859-1');
        } else {
            $text = utf8_encode($text);
        }

        $text = html_entity_decode($text);
        $text = strtolower($text);

        $cleaned = preg_replace(array_keys($searchReplace), array_values($searchReplace), $text);
        if ($cleaned === null) {
            // preg_replace() returns null on failure (e.g. backtrack limit hit
            // on a very large document); do not silently wipe the text.
            throw new RuntimeException("TextMiner: preg_replace failed (PCRE error code " . preg_last_error() . ")");
        }

        $this->setText($cleaned);
    }

    /**
     * Splits the cleaned text on whitespace and stores the resulting words,
     * minus stop words when $removeStopWords is set.
     */
    private function identifyKeywords() {
        // PREG_SPLIT_NO_EMPTY: leading/trailing whitespace would otherwise
        // produce empty-string "keywords".
        $keywords = preg_split('/\s+/', $this->getText(), -1, PREG_SPLIT_NO_EMPTY);
        if ($this->removeStopWords) {
            $keywords = self::removeStopWords($keywords);
        }
        $this->setKeywords($keywords);
    }

    /**
     * Counts how often each keyword occurs and stores the sorted result.
     */
    private function countKeywords() {
        $this->setKeywordCounts(array_count_values($this->getKeywords()));
    }


    //STATIC METHODS - can be called without having an instance of TextMiner

    /**
     * Removes stop words from an array of words.
     *
     * Stop words are read from self::stop_words_file (one per line,
     * lower-cased) when that file exists; otherwise a small built-in list is
     * used. Original array keys are preserved.
     *
     * @param array $words List of words ([0] = w1, [1] = w2, ...). Modified in
     *                     place (passed by reference) and also returned.
     * @return array The filtered words.
     */
    public static function removeStopWords(&$words) {
        $numWordsIn = count($words);
        if (self::verbose) {
            echo "removedStopWords => wordcount (IN: " . $numWordsIn . ") ";
        }

        if (file_exists(self::stop_words_file)) {
            // trim() each line so files with CRLF line endings (or stray
            // spaces) still match the lower-cased keywords.
            $stopWords = array_map(
                'trim',
                explode("\n", strtolower(file_get_contents(self::stop_words_file)))
            );
        } else {
            $stopWords = array("","the","and","a","of","by","although","i","to","in","on","at","but","or","nor","for");
        }

        $words = array_diff($words, $stopWords);

        $numWordsOut = count($words);
        if (self::verbose) {
            echo " (OUT: " . $numWordsOut . ") Removed: " . ($numWordsIn - $numWordsOut) . "<br/>";
        }
        return $words;
    }

    /**
     * Counts how many of the given words share each stem.
     *
     * Requires the Stemming class from "Stemming.php". Empty words and words
     * starting with '#' are skipped.
     *
     * @param array $words      Words to stem (taken by reference, not modified).
     * @param int   $minSupport Accepted for signature compatibility; no
     *                          filtering is applied in this method.
     * @return array Map of stem => number of words, sorted descending.
     */
    public static function getStemCounts(&$words, $minSupport = 5) {
        list(, $stems_ctr) = self::collectStems($words);
        arsort($stems_ctr);
        return $stems_ctr;
    }

    /**
     * Echoes an HTML table listing every stem shared by at least $minSupport
     * words, together with those words and their count.
     *
     * @param array $words      Words to stem (taken by reference, not modified).
     * @param int   $minSupport Minimum number of words a stem needs to be listed.
     */
    public static function outputStemTable(&$words, $minSupport = 5) {
        list($stems, $stems_ctr) = self::collectStems($words);
        arsort($stems_ctr);

        echo "<table style='border: 1px solid #aaa;'><tr style='background: #aa0000; color: white; font-weight: bold;'><td>STEM</td><td>WORDS</td></tr>";
        foreach ($stems_ctr as $stem => $word_count) {
            if ($word_count >= $minSupport) {
                // Array keys may have been cast to int; cast back before escaping.
                $stemHtml = htmlspecialchars((string) $stem, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
                $wordsHtml = htmlspecialchars($stems[$stem], ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
                echo "<tr><td style='background: #efefef; font-weight: bold;'>" . $stemHtml . "</td><td>" . $wordsHtml . " (" . $word_count . ")</td></tr>";
            }
        }
        echo "</table>";
    }

    /**
     * Stems every word and groups the words by stem.
     *
     * @param array $words Words to stem.
     * @return array Two-element array: [0] map of stem => comma-separated
     *               words, [1] map of stem => number of words (unsorted).
     */
    private static function collectStems($words) {
        // Load the stemmer only once; a plain require would re-declare the
        // class (fatal error) on a second call.
        if (!class_exists('Stemming', false)) {
            require_once("Stemming.php");
        }

        $stems = array();
        $stems_ctr = array(); // initialised so empty input yields an empty result

        foreach ($words as $word) {
            // A leading '#' marks a comment line in the word list.
            if (substr($word, 0, 1) != "#" && strlen($word)) {

                $string = strtolower($word); //string to be stemmed
                $stm = new Stemming();
                $stm->reset();
                // The stemmer is fed one character at a time.
                $length = strlen($string);
                for ($i = 0; $i < $length; $i++) {
                    $stm->add(substr($string, $i, 1));
                }
                $stm->stem(0);
                $stem = $stm->toString(); //stem of the string

                if (self::verbose) {
                    echo "<b>$string</b> => <b style='color: blue;'>$stem</b><br/>";
                    if (!is_numeric($stem) && !is_string($stem)) echo "[" . $stem . "]";
                }

                if (array_key_exists($stem, $stems)) {
                    $stems[$stem] .= ", " . $string;
                    $stems_ctr[$stem]++;
                } else {
                    $stems[$stem] = $string;
                    $stems_ctr[$stem] = 1;
                }
            }
        }

        return array($stems, $stems_ctr);
    }
}

// Debug helper: dumps a value with print_r() wrapped in <pre> tags so the
// structure stays readable in a browser. Only defined when no function named
// printa exists yet.
if (function_exists('printa') === false) {
    function printa($array) {
        // print_r() with its second argument set returns the dump instead of
        // echoing it, so the whole block is emitted with one echo.
        echo "<pre>" . print_r($array, true) . "</pre>";
    }
}


?>