<?php if (!defined('PmWiki')) exit();
/*
    extract.php, an extension for PmWiki 2.2, copyright Hans Bracker 2008.
    A general regex processor for extracting text from multiple pages
    using regular expressions and wildcard pagename patterns.

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published
    by the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    See Cookbook:TextExtract for documentation and instructions.

    Syntax: {(extract Pattern PageName [PageName2] [PageName3] ... [keyword=value] ...)}

    Pattern    - display lines matching regex pattern. Pattern must be the first argument.
                 A dot '.' would include all of the page text.
    PageName   - source pages from PageName or Group. Allowed are Wiki wildcards * and ?
                 OR PageName#section.
    -PageName  - PageName excluded from source pagelist, wildcards allowed.

    Options:
    group=GROUP      - source pages from GROUP (same as pagelist group= option)
                       (wiki wildcards and comma-separated listing allowed)
    name=NAME        - source pages NAME from any group (same as pagelist name= option)
                       (wiki wildcards and comma-separated listing allowed)
    prefix=link      - display line with page link above extract
    prefix=linkmod   - display line with page link and 'modified by' link and
                       modified time above extract
    prefix=STRING    - display line with STRING above each page extract
    suffix=STRING    - display line with STRING below each page extract
    cut=PATTERN      - do not display lines matching PATTERN
    count=n          - include only n number of pages in the output.
    lines=n          - display first n lines
    lines=-n         - display last n lines
    lines=n..m       - display lines from line n to line m (including line m)
    lines=n..        - display lines from line n till end
    snip=PATTERN     - do not display text matching PATTERN, remove it from the line
    highlight=off    - do not use match highlighting. Default is 'bold' highlighting,
                       apart from off when textpattern is '.' (included whole page text)
    unit=line        - default: matching row (line) is shown.
    unit=para        - whole paragraph is shown (separated by empty lines or headings)
    unit=page        - whole page text is shown.
    markup=cut       - lines with directives will be ignored.
    markup=on        - directives will be active, but only if pattern is '.' or unit=page.
    markup=code      - Default: lines including directives will be shown as source code.
    case=1           - do case-sensitive search. Default is 0 insensitive case search.
    header=STRING    - display STRING on first line.
    header=count     - display 'Results: count' on first line.
    header=full      - display extended result count on first line plus a footer to mark end.
*/
$RecipeInfo['TextExtract']['Version'] = '2008-03-07';

// defaults for extractor search form
SDVA($ExtractFormOpt, array(
    'size'        => '30',
    'button'      => FmtPageName('$[Search]', $pagename),
    'searchlabel' => FmtPageName('$[Search for]', $pagename),
    'pageslabel'  => FmtPageName('$[On pages]', $pagename),
    'caselabel'   => FmtPageName('$[Case sensitive search]', $pagename),
    'prefix'      => 'link',
    'header'      => 'full',
));

// defaults array
SDVA($TextExtractOpt, array (
    'markup'    => 'code',
    'unit'      => 'line',
    'highlight' => 'bold',
    'case'      => 0,
    'error'     => 1,
));

// markup expression {(extract pattern pagename ...)}
$MarkupExpr['extract'] = 'TextExtract($pagename, @$args, @$argp)';

// NOTE(review): the three Markup() registrations in this file use the /e
// pattern modifier, which PHP 7+ no longer supports. They are left unchanged
// because the right replacement depends on the installed PmWiki version --
// confirm against the PmWiki CustomMarkup documentation before migrating.

// markup (:extract ....:)
Markup('extractform', '>links', '/\\(:extract(\\s.*?)?:\\)/e',
    "ExtractFormMarkup(\$pagename, ParseArgs(PSS('$1')))");

// markup (:extractresult:)
Markup('extractresult', 'directives', '/\\(:extractresult(s)?:\\)/e',
    "MarkupExtractResults(\$pagename)");

// Renders the text stored in $ExtractDisplayFmt (set by HandleExtract) as
// HTML for the (:extractresult:) directive.
//   $pagename - page the markup is rendered on
// Returns the Keep()-protected HTML; empty input is rendered when no extract
// has been stored yet.
function MarkupExtractResults($pagename) {
    $fmt = isset($GLOBALS['ExtractDisplayFmt']) ? $GLOBALS['ExtractDisplayFmt'] : '';
    $out = MarkupToHTML($pagename, $fmt);
    // strip p tags from beginning and end, trim end space
    $out = rtrim(preg_replace("/^<p>(.*?)<\\/p>$/s", "$1", $out));
    return Keep($out);
}

// ?action=extract calls function HandleExtract()
$HandleActions['extract'] = 'HandleExtract';

// Builds the extractor search form for the (:extract ...:) directive.
//   $pagename - page the form is shown on
//   $opt      - parsed directive arguments; merged over $ExtractFormOpt
// Returns the form HTML. The form submits with action=extract to the page
// named by the 'target' option, or to $pagename when no target is given.
function ExtractFormMarkup($pagename, $opt) {
    global $ExtractFormOpt, $EnablePathInfo;
    $opt = array_merge((array)$ExtractFormOpt, (array)$opt);
    $opt['action'] = 'extract';
    $target = isset($opt['target']) ? MakePageName($pagename, $opt['target']) : $pagename;
    $out = FmtPageName(" class='wikisearch' action='\$PageUrl' method='get'>", $target);
    $opt['n'] = IsEnabled($EnablePathInfo, 0) ? '' : $target;

    // A preset pattern is only the initial value of the text input; quote
    // characters are escaped so they cannot close the attribute.
    $pattern = (isset($opt['pattern']) && !is_array($opt['pattern']))
        ? str_replace("'", '&#039;', $opt['pattern']) : '';

    $out .= "\n<table>";
    $out .= "<tr><td>{$opt['searchlabel']} </td><td><input type='text' name='pattern' value='{$pattern}' class='inputbox searchbox' size='{$opt['size']}' /> </td></tr> \n";
    // The page input is offered only when the directive did not fix the pages.
    if (!isset($opt['page'])) {
        $out .= "<tr><td>{$opt['pageslabel']}</td><td><input type='text' name='page' value='' class='inputbox searchbox' size='{$opt['size']}' /> </td></tr> \n";
    }
    $out .= "<tr><td colspan='2'>{$opt['caselabel']}<input type='checkbox' name='case' value='1' />";
    $out .= " <input type='submit' class='inputbutton searchbutton' value='{$opt['button']}' /></td></tr></table> \n";

    // Remaining options travel as hidden fields. Presentation-only options
    // are skipped, and so is 'pattern': a hidden duplicate would follow the
    // text input in the query string and override what the user typed.
    $skip = array('q', 'label', 'value', 'size', 'patlabel', 'pagelabel',
        'pattern', 'button', 'searchlabel', 'pageslabel', 'caselabel', 'target');
    foreach ($opt as $k => $v) {
        if ($v == '' || is_array($v)) continue;
        if (in_array((string)$k, $skip, true)) continue;
        $k = str_replace("'", '&#039;', $k);
        $v = str_replace("'", '&#039;', $v);
        $out .= "<input type='hidden' name='$k' value='$v' /> \n";
    }
    return "<form ".Keep($out)."</form>";
}

// action=extract handler: runs TextExtract() on the request arguments,
// stores the result in $ExtractDisplayFmt and dispatches to the browse action.
//   $pagename - page the request was made for
function HandleExtract($pagename) {
    global $ExtractDisplayFmt;
    $req = RequestArgs();
    $pattern = (isset($req['pattern']) && !is_array($req['pattern'])) ? $req['pattern'] : '';
    $args = array($pattern);
    foreach (array('page', 'page2') as $key) {
        if (!isset($req[$key])) continue;
        $args = array_merge($args, TextExtractSplitList($req[$key]));
    }
    $ExtractDisplayFmt = TextExtract($pagename, $args, $req);
    HandleDispatch($pagename, 'browse', '');
}

// Splits a whitespace-, comma- or pipe-separated list into its non-empty
// items. Non-string input yields an empty list.
function TextExtractSplitList($list) {
    if (!is_string($list)) return array();
    return preg_split('/[\s,|]+/', $list, -1, PREG_SPLIT_NO_EMPTY);
}

// Lists the pages matching one wildcard page pattern. A pattern without a
// dot is taken to be a page name in $group.
function TextExtractListPages($group, $glob) {
    $pgpat = (strpos($glob, '.') !== false) ? $glob : $group.".".$glob;
    // make preg pattern from wildcard pattern
    $prpat = GlobToPCRE(FixGlob($pgpat));
    // make list from preg name pattern
    return ListPages("/$prpat[0]/");
}

// Applies a lines= specification to an array of rows.
//   'n'     first n rows          '-n'   last n rows
//   'n..m'  rows n to m inclusive 'n..'  rows from n to the end
//   '..m'   first m rows
function TextExtractSliceLines($rows, $spec) {
    $spec = trim((string)$spec);
    if ($spec == '') return $rows;
    if (preg_match('/(\d*)\.\.(\d*)/', $spec, $m)) {
        $from = (int)$m[1];
        $to = (int)$m[2];
        if ($from > 0 && $to > 0)
            return array_slice($rows, $from - 1, max(0, $to - $from + 1));
        if ($from > 0) return array_slice($rows, $from - 1);
        if ($to > 0) return array_slice($rows, 0, $to);
        return $rows;
    }
    if ($spec[0] == '-') return array_slice($rows, (int)$spec);
    return array_slice($rows, 0, (int)$spec);
}

// Splits page text into paragraphs for unit=para: first at every line that
// starts with '!' (a heading), then at empty lines. The '!' consumed by the
// heading split is put back on each heading section.
function TextExtractParagraphs($text) {
    $paras = array();
    foreach (preg_split("/\n!/", $text) as $i => $section) {
        if ($i > 0) $section = "!".$section;
        $paras = array_merge($paras, preg_split("/\n\n/", $section));
    }
    return $paras;
}

// Rewrites one row so it is displayed as source text (markup=code): angle
// brackets, markup expressions and directive openers are replaced by HTML
// entities, and a leading heading marker is replaced by '!!!!!'.
function TextExtractEscapeRow($row) {
    $row = str_replace("<", '&lt;', $row);
    $row = str_replace(">", '&gt;', $row);
    $row = preg_replace("/^(!+)/", '!!!!!', $row);
    $row = preg_replace("/\\{(\\(\\w+\\b.*?\\))\\}/", '&#123;$1&#125;', $row);
    $row = str_replace("(:", '(&#58;', $row);
    if (preg_match("/(\\[[@|=])|([@|=]\\])/", $row)) {
        // balance an escape block that was cut off by row splitting
        $aa = preg_match_all("/\\[@/", $row, $maa);
        $bb = preg_match_all("/\\@]/", $row, $mbb);
        if ($aa > $bb) $row = $row."@]";
        if ($aa < $bb) $row = "[@".$row;
        // NOTE(review): kept as received. After the "(:" replacement above this
        // pattern can no longer match, and the str_replace arguments are
        // identical; both literals appear to have lost an HTML entity.
        // Restore from the upstream recipe before relying on this branch.
        if (preg_match_all("/(\\[[@|=])[^\\]]*(\\(:)(.*?[@|=]\\])/", $row, $mp)) {
            foreach ($mp[0] as $i => $v) {
                $v = str_replace("(:", "(:", $v);
                $row = str_replace($mp[0][$i], $v, $row);
            }
        }
    }
    return $row;
}

// Wraps every match of $rx in $row in ''' bold markup. Matches inside
// [@...@] or [=...=] escape blocks close the block before the bold markup
// and reopen it afterwards.
//   $pat - raw search pattern, $qi - regex modifier string, $rx - "/($pat)/$qi"
function TextExtractHighlightRow($row, $pat, $qi, $rx) {
    if (!preg_match_all("/(\\[[@|=])([^\\]]*)($pat)(.*?)([@|=]\\])/".$qi, $row, $mb))
        return preg_replace($rx, "'''$1'''", $row);
    foreach ($mb[0] as $i => $v) {
        // NOTE(review): kept as received; search and replacement are identical,
        // the replacement literal appears to have lost an HTML entity.
        if (strstr($v, "(:")) $v = str_replace(":", ":", $v);
        $mbnew = $v;
        if ($mb[1][$i] == '[@') $mbnew = preg_replace($rx, "@]'''$1'''[@", $v);
        else if ($mb[1][$i] == '[=') $mbnew = preg_replace($rx, "=] '''$1''' [=", $v);
        $row = str_replace($mb[0][$i], $mbnew, $row);
    }
    return $row;
}

// Main text extract processing.
//   $pagename - current page; it is never used as a source page
//   $args     - positional arguments: the regex pattern first, then page
//               names / wildcard patterns ('-' prefix excludes, '#' selects a
//               section)
//   $opt      - keyword options (see the list at the top of this file)
// Returns wiki markup text holding the extracted rows, an optional header
// and footer, and an error line when nothing matched and 'error' is 1.
function TextExtract($pagename, $args, $opt = NULL) {
    global $TextExtractOpt, $FmtV;
    $args = array_values((array)$args);
    $arg0 = isset($args[0]) ? $args[0] : '';
    $arg1 = isset($args[1]) ? $args[1] : '';
    if ($arg0 == '' && $arg1 == '') return '';

    // clean input options, set default options
    $opt = (array)$opt;
    unset($opt[''], $opt['#']);
    foreach ($opt as $key => $val) {
        if (is_array($val)) continue; // array values are never read below
        if ($val == '' || strpos((string)$val, '{$$') !== false) unset($opt[$key]);
    }
    $opt = array_merge((array)$TextExtractOpt, $opt);
    // optional keys default to empty so they can be read unconditionally
    $opt += array('lines' => '', 'cut' => '', 'snip' => '', 'prefix' => '',
        'suffix' => '', 'header' => '', 'footer' => '', 'markup' => '',
        'unit' => '', 'highlight' => '', 'error' => 0);

    // use first argument as regex text pattern
    $pat = (string)array_shift($args);
    $exclpat = array("*","?","+","(",")","[","]","^","$","|","/","??","\\","//");
    if (in_array($pat, $exclpat, true)) return '%red%Error: disallowed character input!';

    // check & use query from PmWiki searchbox
    // NOTE(review): the "> 0" test is kept from the original; for non-numeric
    // needles its result differs between PHP 7 and PHP 8 -- confirm intent.
    if (isset($FmtV['$Needle']) && $FmtV['$Needle']>0) {
        $pat = ltrim($FmtV['$Needle']);
        $opt['error'] = 0;
    }

    // case insensitive search unless case=1
    $qi = (isset($opt['case']) && $opt['case'] == 1) ? '' : 'i';
    $rx = "/($pat)/".$qi;

    // suppress highlight for 'include everything' pattern
    if ($pat == '.') $opt['highlight'] = '';

    // create source pagelist
    $grp = PageVar($pagename, '$Group');
    if (isset($opt['group']))
        foreach (TextExtractSplitList($opt['group']) as $g) $args[] = "$g.*";
    if (isset($opt['name']))
        foreach (TextExtractSplitList($opt['name']) as $n)
            $args[] = ($n[0] == '-') ? "-*.".substr($n, 1) : "*.$n";

    $exlist = array(); $srclist = array();
    foreach ($args as $arg) {
        foreach (TextExtractSplitList($arg) as $src) {
            if ($src[0] == '-') {
                // exclusion
                $src = (string)substr($src, 1);
                if ($src != '')
                    $exlist = array_merge($exlist, TextExtractListPages($grp, $src));
            }
            // section suffix: taken literally, no wildcard expansion
            elseif (strpos($src, '#') !== false) $srclist[] = $src;
            else $srclist = array_merge($srclist, TextExtractListPages($grp, $src));
        }
    }
    $sourcelist = array_diff($srclist, $exlist);
    // don't order list if (pagelist) input. (pagelist order= will be honoured)
    if (!isset($FmtV['$Needle'])) sort($sourcelist);

    $sourcecount = 0;
    $pagecount = 0;

    // process each source page in turn
    $newrows = array();
    foreach ($sourcelist as $source) {
        if ($source == $pagename) continue;
        $hassection = (strpos($source, '#') !== false);
        $srcname = $hassection ? MakePageName($pagename, $source) : $source;
        $pf = 0; // prefix marker: becomes 1 once this page has produced a row
        $page = RetrieveAuthPage($srcname, 'read', true);
        if ($page) {
            $text = isset($page['text']) ? $page['text'] : '';
            // use pagename#section if present
            if ($hassection) $text = TextSection($text, $source);
            $text = Qualify($srcname, $text);
            $textrows = explode("\n", rtrim($text));
            if ($opt['lines'] != '')
                $textrows = TextExtractSliceLines($textrows, $opt['lines']);
            if ($opt['unit'] == 'para')
                $textrows = TextExtractParagraphs($text);
            // unit=page: the whole page is skipped unless its text matches
            if ($opt['unit'] == 'page' && !preg_match($rx, $text))
                $textrows = array();

            foreach ($textrows as $row) {
                // preserve empty rows for 'all including' pattern
                if ($opt['unit'] == 'line' && $row == "" && $pat == ".") {
                    $newrows[] = $row;
                    continue;
                }
                // skip rows which don't match
                if (($opt['unit'] == 'line' || $opt['unit'] == 'para')
                    && !preg_match($rx, $row)) continue;

                // process lines containing markup directives:
                //   cut  - drop rows holding a directive, otherwise as 'on'
                //   on   - leave markup live for '.' or unit=page, else as 'code'
                //   code - show the row as source text
                $mk = $opt['markup'];
                if ($mk == 'cut' && strpos($row, "(:") !== false) continue;
                $live = ($pat == "." || $opt['unit'] == 'page');
                if ($mk == 'code' || (($mk == 'cut' || $mk == 'on') && !$live))
                    $row = TextExtractEscapeRow($row);

                // exclude lines containing matches with cut pattern
                if ($opt['cut'] != '' && preg_match("/({$opt['cut']})/".$qi, $row)) continue;

                $row = ltrim($row);

                // check for & add prefix line
                if ($opt['prefix'] != '' && $pf == 0) {
                    if ($opt['prefix'] == 'link') {
                        $newrows[] = "(:spacer:)\\\\";
                        $newrows[] = "'''[[$source]]'''(:spacer:)";
                    }
                    elseif ($opt['prefix'] == 'linkmod') {
                        $newrows[] = "(:spacer:)\\\\";
                        $lmod = PageVar($srcname, '$LastModified');
                        $lmby = PageVar($srcname, '$LastModifiedBy');
                        $newrows[] = "%rfloat%''last modified by [[~{$lmby}]] on {$lmod}'' %left%'''[[$source]]'''(:spacer:)";
                    }
                    else $newrows[] = $opt['prefix'];
                }

                // remove text matching the snip pattern
                if ($opt['snip'] != '') $row = preg_replace("/{$opt['snip']}/", '', $row);

                // highlighting matches
                if ($opt['highlight'] == 'bold')
                    $row = TextExtractHighlightRow($row, $pat, $qi, $rx);

                $newrows[] = $row;
                if ($opt['unit'] == 'para') $newrows[] = "(:spacer:)";
                $pf = 1; // set prefix marker
            }
        }
        $sourcecount++;
        if ($pf == 1) {
            // the suffix line closes the extract of a page that produced output
            if ($opt['suffix'] != '') $newrows[] = $opt['suffix'];
            $pagecount++;
        }
        if (isset($opt['count']) && $pagecount == $opt['count']) break;
    }
    $newrows[] = "(:spacer:)";

    // output text from array of rows
    $text = implode("\n", $newrows);

    // add header with result counter
    switch ($opt['header']) {
        case 'count':
        case 'counter':
            // count matches
            $cnt = preg_match_all($rx, $text, $m);
            $text = "'''Results: ".$cnt."''' \n----\n".$text;
            break;
        case 'all':
        case 'full':
            // count matches
            $cnt = preg_match_all($rx, $text, $m);
            $from = '';
            if ($pagecount > 1) $from = "on {$pagecount} pages from {$sourcecount} pages searched";
            if ($pagecount == 1) $from = "from {$sourcecount} pages searched";
            $text = "----\n[[#extracttop]]%lfloat%'''Text Extract''' %right%'''{$cnt} results''' {$from}\n----\n".$text;
            $opt['footer'] = "\\\\\n\n----\n%center%'''End of Text Extract''' [[#extracttop|(start)]] \n----";
            break;
        default:
            $text = "{$opt['header']}\n".$text;
            break;
    }
    if ($opt['footer'] && $pagecount > 0) {
        $text .= "\n{$opt['footer']}";
    }
    if ($pagecount == 0 && $opt['error'] == 1) {
        $error = "\n%red%Found no matching pages!%%";
        $text = $text.$error;
    }
    return $text;
}

// empty spacer div, normalise space for nolinebreaks and linebreaks conditions
Markup('spacer', '<restore', '/\\(:spacer:\\)/e', "SpacerMarkup(\$pagename)");

// Replacement for the (:spacer:) markup: an empty string when $HTMLPNewline
// is non-empty, otherwise an empty spacer div.
function SpacerMarkup($pagename) {
    global $HTMLPNewline;
    if ($HTMLPNewline != '') return '';
    return "<div class='spacer'><!-- a spacer division --></div>";
}

// to remove (:spacer:) markup, for use when writing output to a page,
// use in form template with {$$(cleanspacer (extract ....))}
$MarkupExpr['cleanspacer'] = 'CleanSpacer($pagename, @$args[0])';

// Strips every (:spacer:) markup, together with any backslashes directly
// following it, from $text and returns the result.
function CleanSpacer($pagename, $text) {
    return preg_replace('/\(:spacer:\)\\\\*/', "", $text);
}