Code
<?php
//Text Analysis Tools
//Author: Matt Smith
/**
 * Extracts and counts keywords from plain text or HTML.
 *
 * Typical use: feed text with addText()/addFile(), call process(), then read
 * the results with getKeywordCounts() or getTopKeywords(). The stemming
 * helpers at the bottom are static and work on any array of words.
 */
class TextMiner {
    /** @var string Accumulated text; overwritten with the cleaned text by process(). */
    private $text;
    /** @var array Tokens found by the last process() run (keys may be sparse after stop-word removal). */
    private $keywords;
    /** @var array Map of keyword => number of occurrences, sorted by count descending. */
    private $keywordCounts;
    /** @var bool When TRUE, process() drops stop words before counting. */
    public $removeStopWords;
    /**
     * @var bool TRUE once process() has completed since the last clear().
     * Declared explicitly (it used to be created dynamically, which is
     * deprecated since PHP 8.2); kept public so external reads keep working.
     */
    public $processed = FALSE;

    /** Path of the optional stop-word list, one word per line (resolved like any relative path). */
    const stop_words_file = "stop_words.txt";
    /** Set to TRUE to echo debugging output from the static helpers. */
    const verbose = FALSE;

    public function __construct(){
        $this->clear();
    }

    /**
     * Appends text to the buffer, preceded by the ' || ' separator.
     *
     * @param string $text
     */
    public function addText($text){
        $this->text .= ' || '.$text;
    }

    /**
     * Appends the contents of a file to the buffer.
     *
     * @param string $filename Anything file_get_contents() accepts.
     * @return bool TRUE when the file was read and added, FALSE when it could not be read.
     */
    public function addFile($filename){
        $text = file_get_contents($filename);
        // Strict check: an empty file or a file containing just "0" is a
        // successful read, not a failure.
        if ($text === FALSE) {
            return false;
        }
        $this->addText($text);
        return true;
    }

    /** Resets text, keywords, counts and flags to their initial state. */
    public function clear(){
        $this->text = '';
        $this->keywords = array();
        $this->keywordCounts = array();
        $this->processed = FALSE;
        $this->removeStopWords = TRUE;
    }

    /** Cleans the buffered text, tokenises it and counts the resulting keywords. */
    public function process(){
        $this->cleanText();
        $this->identifyKeywords();
        $this->countKeywords();
        $this->processed = TRUE;
    }

    public function setText($text){
        $this->text = $text;
    }

    public function getText(){
        return $this->text;
    }

    public function setKeywords($keywords){
        $this->keywords = $keywords;
    }

    public function getKeywords(){
        return $this->keywords;
    }

    /**
     * Stores the keyword counts, sorted by count descending (keys preserved).
     *
     * @param array $keywordCounts Map of keyword => count.
     */
    public function setKeywordCounts($keywordCounts){
        arsort($keywordCounts);
        $this->keywordCounts = $keywordCounts;
    }

    /**
     * @return array|string The keyword => count map, or the string
     *                      "Run process first." when process() has not run yet.
     */
    public function getKeywordCounts(){
        if (!$this->processed) return "Run process first.";
        return $this->keywordCounts;
    }

    /**
     * Returns the $n most frequent keywords.
     *
     * @param int  $n        Maximum number of keywords to return.
     * @param bool $as_array TRUE for a keyword => count array, FALSE for a
     *                       comma-separated string of the keywords only.
     * @return array|string
     */
    public function getTopKeywords($n=10,$as_array=TRUE){
        $result = array_slice($this->keywordCounts,0,$n,TRUE);
        if ($as_array) {
            return $result;
        }
        return implode(', ',array_keys($result));
    }

    /** Echoes a short HTML summary: the first 200 bytes of the text and the keyword total. */
    public function printSummary(){
        // The buffer may still hold raw markup (before process()), so escape it
        // rather than injecting it into the page. ENT_SUBSTITUTE keeps the
        // excerpt visible even if substr() cut a multi-byte character in half.
        $excerpt = htmlspecialchars(
            trim(substr($this->getText(),0,200)),
            ENT_QUOTES | ENT_SUBSTITUTE,
            'UTF-8'
        );
        echo "======================<br/>";
        echo "Text: <b>".$excerpt."...</b><br/>";
        echo "Total Keywords: <b>".count($this->getKeywords())."</b><br/>";
        echo "======================<br/>";
    }

    //PRIVATE METHODS

    /**
     * Normalises the buffered text in place: converts it to UTF-8, decodes
     * HTML entities, lower-cases it, strips scripts/styles/tags and finally
     * replaces every character other than a-z, A-Z, 0-9, '-', ''' and space
     * with a space.
     */
    private function cleanText() {
        $searchReplace = array(
            //REMOVALS
            "'<script[^>]*?>.*?</script>'si" => " ", //Strip out Javascript
            "'<style[^>]*?>.*?</style>'si" => " ",   //Strip out Styles
            "'<[/!]*?[^<>]*?>'si" => " ",            //Strip out HTML tags
            //ACCEPT ONLY
            "/[^a-zA-Z0-9\-' ]/" => " ",             //only accept these characters
        );

        $text = $this->getText();
        // utf8_encode() is deprecated since PHP 8.2; mb_convert_encoding() from
        // ISO-8859-1 is its documented equivalent. Fall back to the old call
        // only when the mbstring extension is not loaded.
        if (function_exists('mb_convert_encoding')) {
            $text = mb_convert_encoding($text, 'UTF-8', 'ISO-8859-1');
        } else {
            $text = utf8_encode($text);
        }
        $text = html_entity_decode($text);
        $text = strtolower($text);
        $text = preg_replace(array_keys($searchReplace), array_values($searchReplace), $text);
        $this->setText($text);
    }

    /** Splits the cleaned text on whitespace and stores the tokens as keywords. */
    private function identifyKeywords() {
        // PREG_SPLIT_NO_EMPTY: leading/trailing whitespace (always present,
        // because addText() prepends a separator) must not produce '' tokens.
        $keywords = preg_split('/\s+/', $this->getText(), -1, PREG_SPLIT_NO_EMPTY);
        if ($this->removeStopWords) $keywords = $this->removeStopWords($keywords);
        $this->setKeywords($keywords);
    }

    /** Counts how often each keyword occurs and stores the sorted result. */
    private function countKeywords () {
        $this->setKeywordCounts(array_count_values($this->getKeywords()));
    }

    //STATIC METHODS - can be called without having an instance of TextMiner

    /**
     * Removes stop words from an array of words.
     *
     * The list is read from self::stop_words_file (one word per line) when
     * that file exists; otherwise a small built-in English list is used.
     *
     * @param array $words Words to filter; modified in place (passed by reference).
     * @return array The filtered words, original keys preserved.
     */
    public static function removeStopWords (&$words) { //expects an array ([0] = w1, [1] = w2, etc.)
        $numWordsIn = count($words);
        if(self::verbose) { echo "removedStopWords => wordcount (IN: ".$numWordsIn.") "; }
        if (file_exists(self::stop_words_file)) {
            // Split on any line ending and trim each entry so that CRLF files
            // or stray spaces do not silently disable the whole list.
            $stopWords = array_map(
                'trim',
                preg_split('/\R/', strtolower(file_get_contents(self::stop_words_file)))
            );
        } else {
            $stopWords = array("","the","and","a","of","by","although","i","to","in","on","at","but","or","nor","for");
        }
        $words = array_diff($words,$stopWords);
        $numWordsOut = count($words);
        if(self::verbose) { echo " (OUT: ".$numWordsOut.") Removed: ".($numWordsIn-$numWordsOut)."<br/>"; }
        return $words;
    }

    /**
     * Counts how many of the given words reduce to each stem.
     *
     * Empty words and words starting with '#' (comments) are skipped.
     *
     * @param array $words      Words to stem.
     * @param int   $minSupport Accepted for interface compatibility; not used here.
     * @return array Map of stem => number of words, sorted by count descending.
     */
    public static function getStemCounts(&$words,$minSupport=5) {
        $grouped = self::collectStems($words);
        return $grouped['counts'];
    }

    /**
     * Echoes an HTML table of stems, the words that produced them and their counts.
     *
     * @param array $words      Words to stem.
     * @param int   $minSupport Only stems with at least this many words are listed.
     */
    public static function outputStemTable(&$words,$minSupport=5){
        // Both the counts and the per-stem word lists are needed here; the word
        // lists are not part of what getStemCounts() returns.
        $grouped = self::collectStems($words);
        echo "<table style='border: 1px solid #aaa;'><tr style='background: #aa0000; color: white; font-weight: bold;'><td>STEM</td><td>WORDS</td></tr>";
        foreach ($grouped['counts'] as $stem => $word_count) {
            if ($word_count >= $minSupport) {
                // Numeric stems come back as integer keys, hence the cast.
                $stemHtml = htmlspecialchars((string)$stem, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
                $wordsHtml = htmlspecialchars($grouped['words'][$stem], ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
                echo "<tr><td style='background: #efefef; font-weight: bold;'>".$stemHtml."</td><td>".$wordsHtml." (".$word_count.")</td></tr>";
            }
        }
        echo "</table>";
    }

    /**
     * Stems every word and groups the results.
     *
     * @param array $words Words to stem.
     * @return array ['words' => stem => "w1, w2, ...", 'counts' => stem => n (sorted descending)]
     */
    private static function collectStems($words) {
        // Load the stemmer once; a plain require() would redeclare the class
        // (fatal error) on the second call.
        if (!class_exists('Stemming')) {
            require_once("Stemming.php");
        }
        $stems = array();
        $stems_ctr = array(); // initialised so empty input yields an empty result
        foreach ($words as $word) {
            if (substr($word,0,1) !== "#" && strlen($word) > 0) { // '#' indicates a comment
                $string = strtolower($word); //string to be stemmed
                $stm = new Stemming();
                $stm->reset();
                $length = strlen($string);
                for ($i = 0; $i < $length; $i++) {
                    $stm->add(substr($string,$i,1));
                }
                $stm->stem(0);
                $stem = $stm->toString(); //stem of the string
                if(self::verbose) {
                    echo "<b>$string</b> => <b style='color: blue;'>$stem</b><br/>";
                    if(!is_numeric($stem) && !is_string($stem)) echo "[".$stem."]";
                }
                if (array_key_exists($stem,$stems)) {
                    $stems[$stem] .= ", ".$string;
                    $stems_ctr[$stem]++;
                } else {
                    $stems[$stem] = $string;
                    $stems_ctr[$stem] = 1;
                }
            }
        }
        arsort($stems_ctr);
        return array('words' => $stems, 'counts' => $stems_ctr);
    }
}
if (function_exists('printa') === false) {
    /**
     * Debug helper: echoes the print_r() dump of a value wrapped in <pre> tags.
     * Only defined when no function named printa exists yet.
     */
    function printa($array)
    {
        $dump = print_r($array, true);
        echo "<pre>" . $dump . "</pre>";
    }
}
?>