<?php if (!defined('PmWiki')) exit();

/* extract.php, an extension for PmWiki 2.2, copyright Hans Bracker 2008. 
	a general regex processor for extracting text from multiple pages 
	using regular expressions and wildcard pagename patterns.
	
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published
   by the Free Software Foundation; either version 2 of the License, or
   (at your option) any later version.
   
   See Cookbook:TextExtract for documentation and instructions.
   
	Syntax:  {(extract Pattern PageName [PageName2] [PageName3] ... [keyword=value] ...)}
	Pattern - display lines matching regex pattern. Pattern must be the first argument. 
				A dot '.' would include all of the page text.
	PageName - source pages from PageName or Group. 
				 Allowed are Wiki wildcards * and ? OR PageName#section.
	-PageName - PageName excluded from source pagelist, wildcards allowed.

	Options: 
	group=GROUP - source pages from GROUP (same as pagelist group= option) (wiki wildcards and comma-separated listing allowed)
	name=NAME - source pages NAME from any group (same as pagelist name= option) (wiki wildcards and comma-separated listing allowed)	
	prefix=link - display line with page link above extract
	prefix=linkmod - display line with page link and 'modified by' link and modified time above extract
	prefix=STRING - display line with STRING above each page extract
	suffix=STRING - display line with STRING below each page extract
	cut=PATTERN - do not display lines matching PATTERN
	count=n - include only n number of pages in the output.
	lines=n - display first n lines
	lines=-n - display last n lines
	lines=n..m - display lines from line n to line m (including line m)
	lines=n.. - display lines from line n till end
	snip=PATTERN - do not display text matching PATTERN, remove it from the line
	highlight=off - do not use match highlighting. Default is 'bold' highlighting, 
						except when the text pattern is '.' (whole page text included): then highlighting is switched off.
	unit=line - default: matching row (line) is shown.
	unit=para - whole paragraph is shown (separated by empty lines or headings)
	unit=page - whole page text is shown.
	markup=cut - lines with directives will be ignored.
	markup=on - directives will be active, but only if pattern is '.' or unit=page.
	markup=code - Default: lines including directives will be shown as source code.
	case=1 - do case-sensitive search. Default is 0 insensitive case search.
	header=STRING - display STRING on first line. 
	header=count - display 'Results: count' on first line. 
	header=full - display extended result count on first line plus a footer to mark end.
*/
// recipe version identifier
$RecipeInfo['TextExtract']['Version'] = '2008-03-07';

// defaults for extractor search form
// These are merged with the arguments of the (:extract ...:) form markup in
// ExtractFormMarkup(); arguments given in the markup override them.
//   size        - size attribute of the text input boxes
//   button      - caption of the submit button
//   searchlabel - label in front of the search pattern box
//   pageslabel  - label in front of the pages box
//   caselabel   - label in front of the case-sensitivity checkbox
//   prefix, header - sent as hidden form fields, so they reach TextExtract()
//                 as options when the form is submitted with action=extract
// NOTE(review): SDVA presumably only fills in keys that are not already set,
// so values assigned before this file is included would win - confirm
// against PmWiki's SDVA.
// NOTE(review): $pagename is read here at include time; it is assumed to be
// set by PmWiki before recipes are loaded - confirm.
SDVA($ExtractFormOpt, array(
    'size'   		=> '30', 
    'button'  		=> FmtPageName('$[Search]', $pagename),
    'searchlabel' => FmtPageName('$[Search for]', $pagename),
    'pageslabel' 	=> FmtPageName('$[On pages]', $pagename),
    'caselabel' 	=> FmtPageName('$[Case sensitive search]', $pagename),
    'prefix' 		=> 'link',
    'header' 		=> 'full',
));

// defaults array
// Default options of TextExtract(); options passed to an individual call
// are merged over these.
//   markup    - 'code': lines are escaped and shown as source code
//   unit      - 'line': each matching line is one result
//   highlight - 'bold': matches are wrapped in ''' ... '''
//   case      - 0: case-insensitive matching (1 = case-sensitive)
//   error     - 1: append 'Found no matching pages!' when no page matched
SDVA($TextExtractOpt, array (
	'markup' 	=> 'code',
	'unit'   	=> 'line',
	'highlight' => 'bold',
	'case'   	=> 0,
	'error'		=> 1,
));

// markup expression {(extract pattern pagename ...)}
// The expression string names TextExtract() as the implementation and passes
// it the page name, the positional arguments and the parsed argument array.
$MarkupExpr['extract'] = 'TextExtract($pagename, @$args, @$argp)';

// markup (:extract ....:)
// Renders the search form; the directive's arguments are parsed with
// ParseArgs() and handed to ExtractFormMarkup().
// NOTE(review): this rule relies on the /e (eval) regex modifier, which PHP
// deprecated in 5.5 and removed in 7.0. Whether it still works depends on how
// the installed PmWiki version handles /e patterns - confirm before running
// on a current PHP, and consider a callback-style rule.
Markup('extractform', '>links','/\\(:extract(\\s.*?)?:\\)/e',
		"ExtractFormMarkup(\$pagename, ParseArgs(PSS('$1')))");
  
// markup (:extractresult:) 
// Also accepts the plural form (:extractresults:). Outputs the text stored
// by HandleExtract() via MarkupExtractResults().
// NOTE(review): uses the /e modifier as well, see the note above.
Markup('extractresult','directives','/\\(:extractresult(s)?:\\)/e',
		"MarkupExtractResults(\$pagename)");
// Render the extract text stored in the global $ExtractDisplayFmt as HTML
// for the (:extractresult:) markup.
//   $pagename - page the markup is rendered on
// Returns the rendered HTML wrapped in Keep().
function MarkupExtractResults($pagename) {
	$html = MarkupToHTML($pagename, $GLOBALS['ExtractDisplayFmt']);
	// drop a <p>...</p> pair that wraps the whole output
	$html = preg_replace("/^<p>(.*?)<\\/p>$/s", "$1", $html);
	// remove trailing whitespace before handing the result to Keep()
	$html = rtrim($html);
	return Keep($html);
}

// ?action=extract calls function HandleExtract()
// (registers HandleExtract as the handler for the 'extract' action;
// the search form built by ExtractFormMarkup() submits with action=extract)
$HandleActions['extract'] = 'HandleExtract'; 


// extractor search form
// Builds the HTML search form for the (:extract ...:) markup.
//   $pagename - page the form is rendered on
//   $opt      - parsed markup arguments; merged over $ExtractFormOpt.
//               Recognised here: target (page the form submits to),
//               pattern (pre-filled search text), page (fixed source pages;
//               when given no 'pages' input box is shown), size, button and
//               the *label texts. Every other non-empty option is passed on
//               to the action=extract request as a hidden field.
// Returns the complete <form> element; its content is protected with Keep().
function ExtractFormMarkup($pagename, $opt) {
	global $ExtractFormOpt, $EnablePathInfo;
	$opt = array_merge((array)$ExtractFormOpt, (array)$opt);
	$opt['action'] = 'extract';
	if (isset($opt['target'])) $target = MakePageName($pagename, $opt['target']);
	else $target = $pagename;
	$out = FmtPageName(" class='wikisearch' action='\$PageUrl' method='get'>", $target);
	// without path info the target page has to travel as the n= parameter
	$opt['n'] = IsEnabled($EnablePathInfo, 0) ? '' : $target;

	// collect the values shown in the visible part of the form; missing
	// options become empty strings instead of raising undefined-index notices
	$val = array();
	foreach (array('searchlabel', 'pageslabel', 'caselabel', 'pattern', 'size', 'button') as $k)
		$val[$k] = (isset($opt[$k]) && !is_array($opt[$k])) ? $opt[$k] : '';
	// values placed inside single-quoted attributes get the same quote
	// escaping as the hidden fields below
	foreach (array('pattern', 'size', 'button') as $k)
		$val[$k] = str_replace("'", "&#039;", $val[$k]);

	$out .= "\n<table>";
	$out .=
		"<tr><td>{$val['searchlabel']} </td><td><input type='text' name='pattern' value='{$val['pattern']}' class='inputbox searchbox' size='{$val['size']}' /> </td></tr> \n";
	// offer a 'pages' box only when the markup did not fix the source pages
	if (!isset($opt['page'])) {
		$out .=
		"<tr><td>{$val['pageslabel']}</td><td><input type='text' name='page' value='' class='inputbox searchbox' size='{$val['size']}' /> </td></tr> \n";
	}
	$out .= "<tr><td colspan='2'>{$val['caselabel']}<input type='checkbox' name='case' value='1' />";
	$out .= "&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; <input type='submit' class='inputbutton searchbutton' value='{$val['button']}' /></td></tr></table> \n";

	// options that only describe the form itself and must not become hidden
	// fields. 'pattern' is excluded because a hidden duplicate placed after
	// the text box would override whatever the user typed.
	$skip = array('q', 'label', 'value', 'size', 'patlabel', 'pagelabel',
		'searchlabel', 'pageslabel', 'caselabel', 'button', 'pattern');
	foreach ($opt as $k => $v) {
		if ($v == '' || is_array($v)) continue;
		if (in_array((string)$k, $skip, true)) continue;
		$k = str_replace("'", "&#039;", $k);
		$v = str_replace("'", "&#039;", $v);
		$out .=
		"<input type='hidden' name='$k' value='$v' /> \n";
	}
	return "<form ".Keep($out)."</form>";
}

// action=extract handler
// Reads the submitted search form, runs TextExtract() and stores the result
// in the global $ExtractDisplayFmt (output by the (:extractresult:) markup),
// then continues with the normal 'browse' action for $pagename.
//   Request parameters used here:
//     pattern      - regex text pattern (first TextExtract argument)
//     page, page2  - source page names, separated by whitespace, ',' or '|'
//   The complete request array is also passed on as the TextExtract options.
function HandleExtract($pagename) {
	global $ExtractDisplayFmt;
	$req = RequestArgs();
	// a missing or array-valued pattern is treated as empty
	$pattern = (isset($req['pattern']) && !is_array($req['pattern'])) ? $req['pattern'] : '';
	$args = array($pattern);
	// page and page2 are both optional; skip the ones that were not sent
	foreach (array('page', 'page2') as $key) {
		if (!isset($req[$key]) || is_array($req[$key])) continue;
		$names = preg_split("/[\s,|]+/", $req[$key], -1, PREG_SPLIT_NO_EMPTY);
		$args = array_merge($args, $names);
	}
	$ExtractDisplayFmt = TextExtract($pagename, $args, $req);
	HandleDispatch($pagename,'browse','');
}

// main text extract processing
// Collects text matching a regular expression from a set of wiki pages and
// returns it as wiki markup.
//   $pagename - current page; it is never used as a source page, and it
//               supplies the default group for page names without a group
//   $args     - positional arguments: $args[0] is the regex text pattern,
//               the remaining entries are source page names. Names may hold
//               wiki wildcards, a '-' prefix excludes matching pages, and
//               Name#section restricts the extract to a section. Several
//               names in one entry may be separated by whitespace, ',' or '|'.
//   $opt      - keyword options (group, name, prefix, suffix, cut, snip,
//               count, lines, highlight, unit, markup, case, header, footer,
//               error); merged over the defaults in $TextExtractOpt.
// Returns the extract as a string of wiki markup, '' when neither a pattern
// nor a page was given, or an error message for a disallowed pattern.
function TextExtract($pagename, $args, $opt = NULL) {
	global $TextExtractOpt, $FmtV;
	$args = array_values((array)$args);
	$first  = isset($args[0]) ? $args[0] : '';
	$second = isset($args[1]) ? $args[1] : '';
	if ($first == '' && $second == '') return '';

	//clean input options, set default options
	if (!is_array($opt)) $opt = array();
	unset($opt[''], $opt['#']);
	foreach ($opt as $i => $v) {
		if (is_array($v)) continue;
		//drop empty values and unreplaced {$$var} template placeholders
		if ($v == '' || strstr($v, '{$$'))
			unset($opt[$i]);
	}
	//optional keys get an empty default so later reads never hit an unset key
	$optional = array('lines' => '', 'cut' => '', 'snip' => '', 'prefix' => '',
		'suffix' => '', 'header' => '', 'footer' => '');
	$opt = array_merge($optional, (array)$TextExtractOpt, $opt);

	//use first argument as regex text pattern
	$pat = array_shift($args);

	//a pattern consisting only of one of these is not a usable regex
	$exclpat = array("*","?","+","(",")","[","]","^","$","|","/","??","\\","//");
	if (in_array((string)$pat, $exclpat, true))
		return '%red%Error: disallowed character input!';

	//check & use query from PmWiki searchbox
	//NOTE(review): the '> 0' test compares a search string with a number, so
	//its result for non-numeric needles differs between PHP 7 and PHP 8 -
	//confirm the intended condition.
	if (isset($FmtV['$Needle']) && $FmtV['$Needle'] > 0) {
		$pat = ltrim($FmtV['$Needle']);
		$opt['error'] = 0;
	}

	//case insensitive search unless case=1
	$qi = (@$opt['case'] == 1) ? '' : 'i';

	//suppress highlight for 'include everything' pattern
	if ($pat == '.') $opt['highlight'] = '';

	//create source pagelist
	$grp = PageVar($pagename, '$Group');
	$exlist = array();
	$srclist = array();
	$plist = array();

	if (isset($opt['group']))
		$args[] = "{$opt['group']}.*";
	if (isset($opt['name']))
		$args[] = "*.{$opt['name']}";

	foreach ($args as $src) {
		$pgs = preg_split("/[\s,|]+/", (string)$src, -1, PREG_SPLIT_NO_EMPTY);
		$plist = array_merge($plist, $pgs);
	}

	foreach ($plist as $src) {
		//a leading '-' marks an exclusion
		$exclude = ($src[0] == '-');
		if ($exclude)
			$src = substr($src, 1);
		//names with a section suffix are used as given, without wildcard expansion
		elseif (strstr($src, '#')) {
			$srclist[] = $src;
			continue;
		}
		//names without a group belong to the current group
		$pgpat = strstr($src, '.') ? $src : $grp.".".$src;
		//make preg pattern from wildcard pattern
		$prpat = GlobToPCRE(FixGlob($pgpat));
		//make list from preg name pattern
		$found = ListPages("/$prpat[0]/");
		if ($exclude) $exlist = array_merge($exlist, $found);
		else $srclist = array_merge($srclist, $found);
	}
	$sourcelist = array_diff($srclist, $exlist);

	//don't order list if (pagelist ) input. (pagelist order= will be honoured)
	if (!isset($FmtV['$Needle']))
		sort($sourcelist);

	$sourcecount = 0;
	$pagecount = 0;

	//process each source page in turn
	$newrows = array();
	foreach ($sourcelist as $source) {
		if ($source == $pagename) continue;
		$hassection = (strstr($source, '#') !== false);
		$srcname = $hassection ? MakePageName($pagename, $source) : $source;
		//prefix marker: becomes 1 once this page has contributed a row.
		//Reset for every page, so an unreadable page is never counted
		//with the marker left over from the page before it.
		$pf = 0;
		$page = RetrieveAuthPage($srcname, 'read', true);
		if ($page) {
			$text = isset($page['text']) ? $page['text'] : '';
			//use pagename#section if present
			if ($hassection)
				$text = TextSection($text, $source);
			$text = Qualify($srcname, $text);
			$textrows = explode("\n", rtrim($text));

			//lines=n, lines=-n, lines=n..m, lines=n.., lines=..m
			if ($opt['lines'] != '')
				$textrows = TextExtractSliceLines($textrows, $opt['lines']);

			//unit=para: the units are paragraphs, split at headings and at
			//empty lines. They are taken from the full page text, so lines=
			//has no effect in this mode.
			if ($opt['unit'] == 'para') {
				$paras = array();
				$sections = preg_split("/\n!/", $text);
				foreach ($sections as $i => $sec) {
					//give back the '!' heading marker consumed by the split
					if ($i > 0) $sec = "!".$sec;
					$paras = array_merge($paras, preg_split("/(\n\n)/", $sec));
				}
				$textrows = $paras;
			}

			//unit=page: a page without any match contributes nothing
			if ($opt['unit'] == 'page' && !preg_match("/($pat)/".$qi, $text))
				$textrows = array();

			foreach ($textrows as $row) {
				//preserve empty rows for 'all including' pattern
				if ($opt['unit'] == 'line' && $row == "" && $pat == ".") { $newrows[] = $row; continue; }
				//skip rows (lines or paragraphs) which don't match
				if (($opt['unit'] == 'line' || $opt['unit'] == 'para')
					&& !preg_match("/($pat)/".$qi, $row)) continue;
				//process lines containing markup directives
				switch ($opt['markup']) {
					case 'cut':
						//drop rows holding a directive; other rows fall through
						if (strpos($row, "(:") !== false) continue 2;
					case 'on':
						//directives stay active only for whole-text extracts;
						//otherwise fall through to code escaping
						if ($pat == "." || $opt['unit'] == 'page') break;
					case 'code':
						$row = str_replace("<", "&lt;", $row);
						$row = str_replace(">", "&gt;", $row);
						$row = preg_replace("/^(!+)/", '!!!!!', $row);
						$row = preg_replace("/\\{(\\(\\w+\\b.*?\\))\\}/", "&#123;$1&#125;", $row);
						$row = str_replace("(:", "(&#x3a;", $row);
						if (preg_match("/(\\[[@|=])|([@|=]\\])/", $row, $m00)) {
							//close or open an escape block that straddles the row
							$aa = preg_match_all("/\\[@/", $row, $maa);
							$bb = preg_match_all("/\\@]/", $row, $mbb);
							if ($aa > $bb) $row = $row."@]";
							if ($aa < $bb) $row = "[@".$row;
							//inside escape blocks the directive opener is restored
							if (preg_match_all("/(\\[[@|=])[^\\]]*(\\(&#x3a;)(.*?[@|=]\\])/", $row, $mp)) {
								foreach ($mp[0] as $i => $v) {
									$v = str_replace("(&#x3a;", "(:", $v);
									$row = str_replace($mp[0][$i], $v, $row);
								}
							}
						}
						break;
				}
				//exclude lines containing matches with cut pattern
				if ($opt['cut'] != '' && preg_match("/({$opt['cut']})/".$qi, $row)) continue;
				$row = ltrim($row);
				//check for & add prefix line
				if ($opt['prefix'] != '' && $pf == 0) {
					if ($opt['prefix'] == 'link') {
						$newrows[] = "(:spacer:)\\\\";
						$newrows[] = "'''[[$source]]'''(:spacer:)";
					}
					elseif ($opt['prefix'] == 'linkmod') {
						$newrows[] = "(:spacer:)\\\\";
						$lmod = PageVar($srcname, '$LastModified');
						$lmby = PageVar($srcname, '$LastModifiedBy');
						$newrows[] = "%rfloat%''last modified by [[~{$lmby}]] on {$lmod}'' %left%'''[[$source]]'''(:spacer:)";
					}
					else $newrows[] = $opt['prefix'];
				}
				//remove text matching the snip pattern from the row
				if ($opt['snip'] != '')
					$row = preg_replace("/{$opt['snip']}/", '', $row);
				//highlighting matches
				if ($opt['highlight'] == 'bold') {
					if (preg_match_all("/(\\[[@|=])([^\\]]*)($pat)(.*?)([@|=]\\])/".$qi, $row, $mb)) {
						//match lies inside an escape block: close the block
						//around the bold markup and reopen it afterwards
						foreach ($mb[0] as $i => $v) {
							if (strstr($v, "(&#x3a;"))
								$v = str_replace("&#x3a;", ":", $v);
							$mbnew = $v;
							if ($mb[1][$i] == '[@')
								$mbnew = preg_replace("/($pat)/".$qi, "@]'''$1'''[@", $v);
							elseif ($mb[1][$i] == '[=')
								$mbnew = preg_replace("/($pat)/".$qi, "=] '''$1''' [=", $v);
							$row = str_replace($mb[0][$i], $mbnew, $row);
						}
					}
					else
						$row = preg_replace("/($pat)/".$qi, "'''$1'''", $row);
				}
				$newrows[] = $row;
				if ($opt['unit'] == 'para') $newrows[] = "(:spacer:)";
				$pf = 1; //set prefix marker
			}
		}
		$sourcecount++;
		if ($pf == 1) {
			//suffix line goes below each page extract
			if ($opt['suffix'] != '') $newrows[] = $opt['suffix'];
			$pagecount++;
		}
		//count=n: stop after n pages have contributed
		if (isset($opt['count']) && $pagecount == $opt['count']) break;
	}
	$newrows[] = "(:spacer:)";
	//output text from array of rows
	$text = implode("\n", $newrows);

	//add header with result counter
	switch ($opt['header']) {
		default:
			$text = "{$opt['header']}\n".$text;
			break;
		case 'count':
		case 'counter':
			//count matches
			$cnt = preg_match_all("/($pat)/".$qi, $text, $m);
			$text = "'''Results: ".$cnt."''' \n----\n".$text;
			break;
		case 'all':
		case 'full':
			//count matches
			$cnt = preg_match_all("/($pat)/".$qi, $text, $m);
			$from = '';
			if ($pagecount > 1)
				$from = "on {$pagecount} pages from {$sourcecount} pages searched";
			if ($pagecount == 1)
				$from = "from {$sourcecount} pages searched";
			$text = "----\n[[#extracttop]]%lfloat%'''Text Extract''' %right%'''{$cnt} results''' &nbsp;&nbsp; {$from}\n----\n".$text;
			$opt['footer'] = "\\\\\n\n----\n%center%'''End of Text Extract''' &nbsp;&nbsp; [[#extracttop|(start)]] \n----";
			break;
	}
	if ($opt['footer'] && $pagecount > 0) {
		$text .= "\n{$opt['footer']}";
	}
	if ($pagecount == 0 && $opt['error'] == 1) {
		$error = "\n%red%Found no matching pages!%%";
		$text = $text.$error;
	}
	return $text;
}

// helper for TextExtract(): apply a lines= specification to an array of rows
//   n     - first n rows
//   -n    - last n rows
//   n..m  - rows n to m (1-based, m included)
//   n..   - rows from n to the end
//   ..m   - first m rows
// A specification in any other form leaves the rows unchanged.
function TextExtractSliceLines($rows, $spec) {
	$spec = trim((string)$spec);
	if (preg_match('/^(\d*)\.\.(\d*)$/', $spec, $m)) {
		$from = (int)$m[1];
		$to = (int)$m[2];
		if ($from > 0 && $to > 0)
			return array_slice($rows, $from - 1, max(0, $to - $from + 1));
		if ($from > 0) return array_slice($rows, $from - 1);
		if ($to > 0) return array_slice($rows, 0, $to);
		return $rows;
	}
	if (preg_match('/^-\d+$/', $spec)) return array_slice($rows, (int)$spec);
	if (preg_match('/^\d+$/', $spec)) return array_slice($rows, 0, (int)$spec);
	return $rows;
}

//empty spacer div, normalise space for nolinebreaks and linebreaks conditions
// Replaces every (:spacer:) directive with the result of SpacerMarkup().
// NOTE(review): the rule uses the /e (eval) regex modifier, which PHP
// deprecated in 5.5 and removed in 7.0 - confirm how the installed PmWiki
// version treats such rules.
Markup('spacer', '<restore', 
  '/\\(:spacer:\\)/e', 
  "SpacerMarkup(\$pagename)");

// Output for the (:spacer:) directive.
//   $pagename - page being rendered (not used)
// Returns an empty spacer <div> while the global $HTMLPNewline is empty,
// and an empty string once $HTMLPNewline holds a value.
function SpacerMarkup($pagename) {
	global $HTMLPNewline;
	$plain = ($HTMLPNewline == '');
	return $plain
		? "<div class='spacer'><!-- a spacer division --></div>"
		: '';
}

// to remove (:spacer:) markup, for use when writing output to a page, 
// use in form template with {$$(cleanspacer (extract ....))}
// The expression passes its first argument to CleanSpacer().
$MarkupExpr['cleanspacer'] = 'CleanSpacer($pagename, @$args[0])';
// Strip every (:spacer:) directive, together with any backslashes that
// directly follow it, from $text.
//   $pagename - current page (not used)
//   $text     - wiki markup, e.g. the output of TextExtract()
// Returns the cleaned text.
function CleanSpacer($pagename, $text) {
	return preg_replace("/\\(:spacer:\\)\\\*/", "", $text);
}