<?php/* * * * funcs.mysql.boolean.php * * * * * * * * * * * * * * * * * * * * *
* * The following file contains functions for transforming search * strings into boolean SQL. To download the sample script and * dataset that use these functions, reference: * http://davidaltherr.net/web/php_functions/ * boolean/example.mysql.boolean.txt * * Copyright 2001 David Altherr * altherda@email.uc.edu * www.davidaltherr.net * * All material granted free for use under MIT general public license * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *//* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
* :: get_fulltext_key($table) :: * retrieves the fulltext key from a table as a comma delimited * list of values. requires: * a. $mysqldb (selected database) * OR * b. $table argument in the form 'db.table' * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */function get_fulltext_key($table,$db_connect){ global $mysqldb; mysql_select_db($mysqldb,$db_connect);/* grab all keys of db.table */
$indices=mysql_query("SHOW INDEX FROM $table",$db_connect) or die(mysql_error()); $indices_rows=mysql_num_rows($indices);/* grab only fulltext keys */
for($nth=0;$nth<$indices_rows;$nth++){ $nth_index=mysql_result($indices,$nth,'Comment'); if($nth_index=='FULLTEXT'){ $match_a[].=mysql_result($indices,$nth,'Column_name'); } }/* delimit with commas */
$match=implode(',',$match_a);return $match;
}/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
* :: boolean_mark_atoms($string) :: * used to identify all word atoms; works using simple * string replacement process: * 1. strip whitespace * 2. apply an arbitrary function to subject words * 3. represent remaining characters as boolean operators: * a. ' '[space] -> AND * b. ','[comma] -> OR * c. '-'[minus] -> NOT * 4. replace arbitrary function with actual sql syntax * 5. return sql string * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */function boolean_mark_atoms($string){ $result=trim($string); $result=preg_replace("/([[:space:]]{2,})/",' ',$result);/* convert normal boolean operators to shortened syntax */
$result=eregi_replace(' not ',' -',$result); $result=eregi_replace(' and ',' ',$result); $result=eregi_replace(' or ',',',$result);/* strip excessive whitespace */
$result=str_replace('( ','(',$result); $result=str_replace(' )',')',$result); $result=str_replace(', ',',',$result); $result=str_replace(' ,',',',$result); $result=str_replace('- ','-',$result);/* apply arbitrary function to all 'word' atoms */
$result=preg_replace( "/([A-Za-z0-9]{1,}[A-Za-z0-9\.\_-]{0,})/", "foo[('$0')]bar", $result);/* strip empty or erroneous atoms */
$result=str_replace("foo[('')]bar",'',$result); $result=str_replace("foo[('-')]bar",'-',$result);/* add needed space */
$result=str_replace(')foo[(',') foo[(',$result); $result=str_replace(')]bar(',')]bar (',$result);/* dispatch ' ' to ' AND ' */
$result=str_replace(' ',' AND ',$result);/* dispatch ',' to ' OR ' */
$result=str_replace(',',' OR ',$result);/* dispatch '-' to ' NOT ' */
$result=str_replace(' -',' NOT ',$result);return $result;
}/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * :: boolean_sql_where($string,$match) :: * function used to transform identified atoms into mysql * parseable boolean fulltext sql string; allows for * nesting by letting the mysql boolean parser evaluate * grouped statements * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */function boolean_sql_where($string,$match){ $result = boolean_mark_atoms($string);/* dispatch 'foo[(#)]bar to actual sql involving (#) */
$result=preg_replace( "/foo\[\(\'([^\)]{4,})\'\)\]bar/", " match ($match) against ('$1')>0 ", $result); $result=preg_replace( "/foo\[\(\'([^\)]{1,3})\'\)\]bar/e", " '('.boolean_sql_where_short(\"$1\",\"$match\").')' ", $result);return $result;
}/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * :: boolean_sql_where_short($string,$match) :: * parses short words <4 chars into proper SQL: special adaptive * case to force return of records without using fulltext index * keep in mind that allowing this functionality may have serious * performance issues, especially with large datasets * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */function boolean_sql_where_short($string,$match){ $match_a = explode(',',$match); for($ith=0;$ith<count($match_a);$ith++){ $like_a[$ith] = " $match_a[$ith] LIKE '%$string%' "; } $like = implode(" OR ",$like_a);return $like;
}/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
* :: boolean_sql_select($string,$match) :: * function used to transform a boolean search string into a * mysql parseable fulltext sql string used to determine the * relevance of each record; * 1. put all subject words into array * 2. enumerate array elements into scoring sql syntax * 3. return sql string * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */function boolean_sql_select($string,$match){ /* build sql for determining score for each record */ preg_match_all( "([A-Za-z0-9]{1,}[A-Za-z0-9\-\.\_]{0,})", $string, $result); $result = $result[0]; for($cth=0;$cth<count($result);$cth++){ if(strlen($result[$cth])>=4){ $stringsum_long .= " $result[$cth] "; }else{ $stringsum_a[] = ' '.boolean_sql_select_short($result[$cth],$match).' '; } } if(strlen($stringsum_long)>0){ $stringsum_a[] = " match ($match) against ('$stringsum_long') "; } $stringsum .= implode("+",$stringsum_a); return $stringsum;}/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * :: boolean_sql_select_short($string,$match) :: * parses short words <4 chars into proper SQL: special adaptive * case to force 'scoring' of records without using fulltext index * keep in mind that allowing this functionality may have serious * performance issues, especially with large datasets * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */function boolean_sql_select_short($string,$match){ $match_a = explode(',',$match); $score_unit_weight = .2; for($ith=0;$ith<count($match_a);$ith++){ $score_a[$ith] = " $score_unit_weight*( LENGTH($match_a[$ith]) - LENGTH(REPLACE(LOWER($match_a[$ith]),LOWER('$string'),''))) /LENGTH('$string') "; } $score = implode(" + ",$score_a);return $score;
}/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * :: boolean_inclusive_atoms($string) :: * returns only inclusive atoms within boolean statement * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */function boolean_inclusive_atoms($string){$result=trim($string);
$result=preg_replace("/([[:space:]]{2,})/",' ',$result);/* convert normal boolean operators to shortened syntax */
$result=eregi_replace(' not ',' -',$result); $result=eregi_replace(' and ',' ',$result); $result=eregi_replace(' or ',',',$result);/* drop unnecessary spaces */
$result=str_replace(' ,',',',$result); $result=str_replace(', ',',',$result); $result=str_replace('- ','-',$result);/* strip exlusive atoms */
$result=preg_replace( "(\-\([A-Za-z0-9]{1,}[A-Za-z0-9\-\.\_\,]{0,}\))", '', $result); $result=preg_replace( "(\-[A-Za-z0-9]{1,}[A-Za-z0-9\-\.\_]{0,})", '', $result); $result=str_replace('(',' ',$result); $result=str_replace(')',' ',$result); $result=str_replace(',',' ',$result);return $result;
}/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * :: boolean_parsed_as($string) :: * returns the equivalent boolean statement in user readable form * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */function boolean_parsed_as($string){ $result = boolean_mark_atoms($string);/* dispatch 'foo[(%)]bar' to empty string */
$result=str_replace("foo[('","",$result); $result=str_replace("')]bar","",$result);return $result;
}?>
The code of a simple example implementation complete with sample data:
example.mysql.boolean.php<?php/* * * * example.mysql.boolean.php * * * * * * * * * * * * * * * * * * * * *
* * The following file contains sample data to demonstrate the * capability of the functions contained in funcs.mysql.boolean.php: * http://davidaltherr.net/web/php_functions/ * boolean/funcs.mysql.boolean.txt * To see the example, load the data into MySQL then run this script * * Copyright 2001 David Altherr * altherda@email.uc.edu * www.davidaltherr.net * * All material granted free for use under MIT general public license * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *//* * * * Example Implementation * * * * * * * * * * * * * * * * * * * * *
// MySQL Database //
// architecture //
CREATE DATABASE news; USE news;CREATE TABLE quotes (
id int(9) NOT NULL auto_increment, author varchar(255), content text, PRIMARY KEY (id), UNIQUE KEY id (id), FULLTEXT KEY author (author,content) ) TYPE=MyISAM;// data //
INSERT INTO quotes VALUES( 10000,'George Stephanopolous', 'The President has kept all the promises he intended to keep.'); INSERT INTO quotes VALUES( 10001,'Dan Quayle', 'It is wonderful to be here in the great state of Chicago.'); INSERT INTO quotes VALUES( 10002,'Marion Barry', 'Outside of the killings, Washington has one of the lowest crime rates in the country.'); INSERT INTO quotes VALUES( 10003,'David Dinkins', 'I haven\'t committed a crime. What I did was fail to comply with the law.'); INSERT INTO quotes VALUES( 10004,'Dan Quayle', 'It isn\'t pollution that\'s harming the environment. It\'s the impurities in our air and water that are doing it.'); INSERT INTO quotes VALUES( 10005,'George Dubya', 'One word sums up probably the responsibility of any Governor, and that one word is \' to be prepared \'.'); INSERT INTO quotes VALUES( 10006,'George Dubya', 'The most important job is not to be Governor, or First Lady in my case.'); * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ // PHP Script //// form input //
$table_name = 'news.quotes'; $search_string = 'george (governor,president) -responsibility';// database connection //
$db_host = 'localhost'; $db_user = 'username'; $db_pwd = 'password'; $db_connect = mysql_connect($db_host,$db_user,$db_pwd) or die(mysql_error());// sql construction //
require_once('funcs.mysql.boolean.php'); $fulltext_key = get_fulltext_key($table_name,$db_connect); $sql = "SELECT id, author, content, " .boolean_sql_select( boolean_inclusive_atoms($search_string), $fulltext_key)." as relevance " ."FROM $table_name " ."WHERE " .boolean_sql_where($search_string,$fulltext_key)." " ."HAVING relevance>0 " ."ORDER BY relevance DESC ";// data query //
$result = mysql_query($sql,$db_connect) or die(mysql_error()); $result_rows = mysql_num_rows($result);// get results //
$output = " <table border=1> <thead> <tr> <th>id</th> <th>author</th> <th>content</th> <th>relevance</th> </tr> </thead> <tbody>"; for($ith=0;$ith<$result_rows;$ith++){ $ir=mysql_fetch_row($result); $output .= " <tr> <td>$ir[0] </td> <td> $ir[1] </td> <td> $ir[2] </td> <td> $ir[3]</td> </tr>"; } $output .= " </tbody> </table>";// get user readable statement //
$parsed_as = boolean_parsed_as($search_string);// display process //
echo "<h5>Input Statement</h5>" ."<p>$search_string</p>" ."<h5>Parsed As</h5>" ."<p>$parsed_as</p>" ."<h5>SQL Generated</h5>" ."<p>".nl2br($sql)."</p>" ."<h5>Query Results</h5>" ."<p>$output</p>";?>
SELECT
and WHERE
clauses as follows. First, we must retrieve the FULLTEXT KEY
column names from our table, $table_name
, via our database connection, $db_connect
:$fulltext_key
and the user input, $search_string
:
AND
OR
NOT
(
)
AND
OR
NOT
AND
operator has precedence over the OR
operator.
myisam/ftdefs.h
file: #define MIN_WORD_LEN 4
If you need more functionality and don't want to go through the trouble of recompiling MySQL, no worry: the code is currently written to be adaptive to words of three or less characters. For a given word, the SQL element in the SELECT
clause will switch to a simple scoring algorithm that does a case insensitive count of the subject word in all of the FULLTEXT KEY
columns without the use of the fulltext index; the SQL element in the WHERE
clause will switch to a LIKE
string comparison of the subject word with all of the FULLTEXT KEY
columns. Note that enabling this additional functionality may create some performance issues with larger datasets as it runs significantly slower than the fulltext system used for words of 4+ characters, perhaps because it does not use a compiled algorithm or the fulltext index.
atom1 AND atom2
atom1 OR atom2
atom1 NOT atom2
atom1 (atom2,atom3) -atom4
(atom1,atom2) -(atom1 atom2)
[construct for atom1 XOR atom2
]atom1 (atom2,(atom3 -atom4)) -atom5
WHERE
Clausetitle,content
indexed on the FULLTEXT KEY
, the typical syntax for implementing a fulltext search on said table might be: MATCH (title,content) AGAINST ('george')
If the fulltext index contains the word george
, the above will return a floating point number, typically between 0.0 (exclusive) and 5.0 for any given record. If the fulltext index does not contain the word george
, the above will return 0.0 for any given record. Thus, we can make the above statement evaluate to TRUE or FALSE with the following syntax:
MATCH (title,content) AGAINST ('george') > 0
In order to facilitate boolean capability for an expression like
george AND dubya
the corresponding SQL code in the WHERE
clause will be
SELECT
ClauseSELECT
clause can be used to calculate the total relevance as declared by the relevance
alias for the above example like so:
but the code utilizes a simplification which produces a numerical equivalent:
HAVING
ClauseHAVING
clause which will ensure that the query only returns records with a non-zero relevance: relevance > 0
While this may seem redundant considering the SQL in the WHERE
clause which already deals which the inclusion and exclusion of records, this allows the developer to impose a minimum relevance restriction on the result set.
relevance > 0.2
ORDER BY
Clause relevance DESC
-- David Altherr