<?php if (!defined('PmWiki')) exit();
/* extract.php, an extension for PmWiki 2.2, copyright Hans Bracker 2009. 
   a general regex processor for extracting text from multiple pages 
   using regular expressions and wildcard pagename patterns.
	
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published
   by the Free Software Foundation; either version 2 of the License, or
   (at your option) any later version.

   Syntax:  {(extract Term1 [Term3] [-Term3] ... [group=GroupName] [name=PageName] [keyword=value] ...)}
   See Cookbook:TextExtract for documentation and instructions.
*/
$RecipeInfo['TextExtract']['Version'] = '2023-02-04';

// Initialisation: when a search request asks for fmt=extract, adjust the
// 'q' query string before it is used by the search/pagelist code.
if ($action=='search' && isset($_REQUEST['fmt']) && $_REQUEST['fmt']=='extract') {
	// A missing or non-string 'q' is treated as an empty query, so the string
	// operations below never see an undefined key or an array.
	if (!isset($_REQUEST['q']) || !is_string($_REQUEST['q'])) $_REQUEST['q'] = '';
	//enclose any phrase in quote marks
	if (isset($_REQUEST['phrase']) && $_REQUEST['phrase']==1) {
		$_REQUEST['q'] = '"'.trim($_REQUEST['q'],'\'" ').'"';
	}
	//add a space, so FmtPageList() will not transform 'foo/' to group='foo'
	if (isset($_REQUEST['group']) || isset($_REQUEST['name']) || isset($_REQUEST['page'])) {
		$_REQUEST['q'] = " ".$_REQUEST['q'];
	}
	//leave out the standard PmWiki search result header and footer text
	$SearchResultsFmt = "\$MatchList";
	//a leading space is always added (in addition to the one above)
	$_REQUEST['q'] = " ".$_REQUEST['q'];
}
//base style for the recipe's output
$HTMLStylesFmt['textextract'] = " .textextract {margin:0.5em;} ";

// Default options of the (:extract ... :) search form. SDVA only fills in
// keys that have not been set already. All visible labels go through
// FmtPageName() so their $[...] phrases are handled by PmWiki.
SDVA($ExtractFormOpt, [
	// 1 = show the previously submitted values in the form fields
	'retain-input' => 1,
	// width of the text input boxes
	'size' => '30',
	// caption of the submit button
	'button' => FmtPageName('&nbsp;$[Search]&nbsp;', $pagename),
	// labels of the two text fields
	'searchlabel' => FmtPageName('$[Search&nbsp;for&nbsp;]', $pagename),
	'pageslabel' => FmtPageName('$[On&nbsp;pages]', $pagename),
	// labels of the option checkboxes
	'caselabel' => FmtPageName('$[Match&nbsp;case]', $pagename),
	'phraselabel' => FmtPageName('$[Match&nbsp;phrase]', $pagename),
	'wordlabel' => FmtPageName('$[Match&nbsp;whole&nbsp;word]', $pagename),
	'regexlabel' => FmtPageName('$[Regular&nbsp;expression]', $pagename),
]);

// Default options for TextExtract(); options passed by the caller are merged
// over these. SDVA only fills in keys that have not been set already.
SDVA($TextExtractOpt, [
	// --- what is extracted and how it is rendered ---
	'markup' => 'cut',        // one of: code, cut, text, source, on
	'unit' => 'dsent',        // one of: page, para, line, sent, dline, dsent
	'highlight' => 'yellow',  // background colour, or 'bold', or 'none'
	// --- colours of the optional numbering ---
	'linenum-color' => 'green',
	'matchnum-color' => 'green',
	'pagenum-color' => 'green',
	// --- headers ---
	'title' => 'Search results for ', //XL('Text Extract'),
	'header' => 'full',
	'phead' => 'link',
	// --- layout ---
	'linewrap' => 1,
	'rowspacing' => '0.5em',
	// --- matching switches ---
	'case' => 0,
	'phrase' => 0,
	'regex' => 0,
	// if set to 0, pages which match excludes will be excluded,
	// not just paragraphs or sentences etc
	'unit-excludes' => 1,
	// --- messages, timing, numbering ---
	'error' => 1,
	'timer' => 0,
	'pagenum' => 0,
	'matchnum' => 0,
	'linenum' => 0,
	// --- links shown as plain text ---
	'textlinks' => 0,
	'linktext' => 'blue',
	// --- shortening of rows around matches ---
	'shorten' => 0,
	'lwords' => 5,
	'rwords' => 10,
	'ellipsis' => '&hellip;',
	'linebreaks' => 1,
]);

// Regex => replacement pairs used by TERemoveInlineMarkup() to strip inline
// wiki markup from text. They are applied in the order listed, so the
// three-apostrophe rule must stay ahead of the two-apostrophe rule.
SDVA($TERemoveMarkupPatterns, array(
	"/'''(.*?)'''/" => "$1", //'''strong''' (bold)
	"/''(.*?)''/" => "$1", //''emphasis'' (italic)
	"/'\\-(.*?)\\-'/" => "$1", //'-smaller-'
	// group 1 is the run of +/- signs, group 3 is the enclosed text; keep the text
	"/\\[(([-+])+)(.*?)\\1\\]/" => "$3", //[+big+], [-small-]
	// the closing apostrophe is part of the markup and must be consumed too
	"/'\\^(.*?)\\^'/" => "$1", //'^super script^'
	"/'_(.*?)_'/" => "$1", //'_sub script_'
	"/\\{\\+(.*?)\\+\\}/" => "$1", //{+ins+} (underline)
	"/\\{-(.*?)-\\}/" => "$1", //{-del-} (strike through)
	"/(%.*?%)/" => "", //%wiki styles% %%
	"/^-+[<>]\\s*/" => "", //->indents, -<outdents
	"/^(\\*+)(.*?)$/m" => "$2",  //* unordered list bullets
	"/^(\\#+)(.*?)$/m" => "$2",  //# ordered list bullets
	"/^(:+)(?=(\s*)([^:]+):)/m" => " ", //: definition : list
	"/ +/" => " ", //multiple spaces to single space
));

// main function for text extract processing
/**
 * Extract matching text rows from a list of pages and return them as HTML.
 *
 * For every page in $list the text is split into rows by TETextRows(), rows
 * are cleaned by TEMarkupCleaner(), filtered against the search pattern and
 * the cut/exclude patterns, highlighted and numbered, and finally rendered
 * with MarkupToHTML() between a results header and footer.
 *
 * @param string     $pagename name of the page the extract is rendered on
 * @param array      $list     names of the pages to be searched
 * @param array|null $opt      options; merged over $TextExtractOpt. The search
 *                             terms are read from $opt[''], $opt['+'],
 *                             $opt['-'], $opt['--'] and $opt['pattern'].
 * @return string the result of Keep() on the generated HTML, or a
 *                '%red%...' wiki markup string when the input is rejected
 */
function TextExtract($pagename, $list, $opt = NULL) {
	global $TextExtractOpt, $TEModeDefaults, $TextExtract, $TextExtractExclude, 
		 $FmtV, $HTMLPNewline, $HTMLStylesFmt, $KeepToken, $KPV, $PageListArgPattern, $StrFoldFunction;
##DEBUG echo "<pre>LIST "; print_r($list); echo "</pre>";
	//the default is NULL: make sure foreach and array_merge below get an array
	$opt = (array)$opt;
	foreach($opt as $k => $v) {
		if (is_array($v))	
			foreach($v as $kk =>$vv)
				$opt[$k][$kk] = stripmagic($vv);
		else $opt[$k] = stripmagic($v);
	}
	//internal arg array
	$par = array();
	//start time: taken from the caller if supplied, otherwise measured here
	StopWatch('TextExtract start');
	if (!empty($opt['stime'])) $par['stime'] = $opt['stime'];
	else $par['stime'] = strtok(microtime(), ' ') + strtok('');
	$opt = array_merge($TextExtractOpt, $opt);
	$HTMLPNewline = ($opt['linebreaks']==1) ? '<br />' : '';
	//normalise unit names; the 'd...' units set the 'double' flag
	switch ($opt['unit']) {
		case 'sentence':  $opt['unit'] = 'sent'; break;
		case 'paragraph': $opt['unit'] = 'para'; break;
		case 'dline':     $opt['unit'] = 'line'; $opt['double'] = 1; break;
		case 'dsent':     $opt['unit'] = 'sent'; $opt['double'] = 1; break;	
	}
	if($opt['markup']=='text') $opt['textlinks'] = 1; //for 'text' mode links are shown as text
	
##DEBUG echo "<pre>OPT "; print_r($opt); echo "</pre>";		
	//input parameter check
	if (!in_array($opt['unit'], array('line','para','page','sent')) 
		OR !in_array($opt['markup'], array('code','cut','source','text','on')))
			return "%red%$[Error: check input parameters!]";
			
	//'+' terms are treated like plain terms
	foreach((array)@$opt['+'] as $i) $opt[''][] = $i; 
	if (!isset($opt['']) && !isset($opt['pattern'])) return '%red%$[Error: search term missing!]';

	//term is regular expression
	if ($opt['regex']==1) {
		$par['pat'] = $pat = $par['pattern'] = $opt[''][0] = $opt['pattern'];
		//exclude various input patterns
		SDVA($TextExtractExclude, array("*","?","+","(",")","[","]","^","$","|","??","\\"));
		foreach($TextExtractExclude as $v)
				if($pat==$v) return '%red%$[Error: disallowed character input!]'; 
	}
	//no regex: term to be parsed and preg characters escaped
	else { 
		$terms = implode(" + ", $opt['']);
		//quoted terms switch on phrase mode
		if (preg_match('/\\".*\\"/',$terms))
			$opt['phrase'] = 1;
		if (isset($opt['-']))
			$terms .= " -".implode(" -", $opt['-']); 
		$par['pattern'] = $terms;	//for display in results header 	
		$pregchars = array('.','?','!','*','|','$','(',')','[',']','{','}',);
		foreach ($pregchars as $v) {
			$opt[''] = str_replace($v,'\\'.$v, $opt['']);	
			if (isset($opt['-'])) $opt['-'] = str_replace($v,'\\'.$v, $opt['-']);
		}
		//any of the terms may match a row
		$pat = implode("|", $opt['']); 
		if (isset($opt['word']) && $opt['word']==1)
			$pat = "\b".$pat."\b";
		$par['pat'] = $pat;

		//add excludes to cut rows
		if (isset($opt['--'])) {
			$par['xcut'] = implode("|",$opt['--']);
			if (isset($opt['word']) && $opt['word']==1) 
				$par['xcut'] = "\b". $par['xcut'] ."\b";
			$par['pattern'] = $par['pattern']." - ". implode(' - ',$opt['--']);
		}
	}
##DEBUG echo "<br>pat= ".($pat); echo " --par['xcut']= ".$par['xcut']; echo " --par['pat']= ".$par['pat']." --par['pattern']= ".$par['pattern'] ;

	$HTMLStylesFmt['teimages'] = " .image {max-width:10em; } ";	
	$HTMLStylesFmt['te-results'] = "div.te-results {height:100%;}";
	//always wrap lines when displaying preformatted 'source' code
	if ($opt['markup']=='source')
		$opt['linewrap'] = 1;
	// wrap lines of preformatted text and code
	if($opt['linewrap']==1) { 
	  # whitespace wrap (perhaps copy styles to css stylesheet)
	  $HTMLStylesFmt['prewrap'] = "
      code, div.te-results pre, div.te-results code, code.escaped, pre.escaped { white-space: pre-wrap; padding-left: 1em; }
	  ";
	} 
	if($opt['rowspacing']!=0) {
		$HTMLStylesFmt['rowspacing'] = ".spacer { min-height: {$opt['rowspacing']};}";
	}
	//setting keep values here, and keeptokens directly in TEHighLight()
	//instead of calling Keep again and again
	switch ($opt['highlight']) {
		case 'none':
			$KPV['01-TE'] = $KPV['02-TE'] = "";
			break;
		case 'bold':
			$KPV['01-TE'] = "<strong>";
			$KPV['02-TE'] = "</strong>";
			break;
		case '1':
		default:
			//any other value is used as the background colour
			$KPV['01-TE'] = "<span class='te-hilight'>";
			$KPV['02-TE'] = "</span>";
			$HTMLStylesFmt['te-hilight'] =
				" .te-hilight { background-color: {$opt['highlight']}; } ";
	}
	$par['hitoklen'] = 2* (5 + 2 * strlen($KeepToken)); // 2* ( KeepToken-length + KPV-key-length + KeepToken-length )
	$KPV['03-TE'] = "<br />";
	$par['br-tag'] = $KeepToken."03-TE".$KeepToken;
	$KPV['04-TE'] = "<div class='spacer'><!-- spacer --></div>";
	$par['vspace'] = $KeepToken."04-TE".$KeepToken;

	//header, footer, pagelink prefix styles
	if ($opt['header']=='full') $opt['footer'] = 1;
	if ($opt['phead']) {
		SDV($HTMLStylesFmt['teprefix'], 
	    " .te-pageheader { margin:.8em 0 .5em 0; padding:.2em .2em 0 .2em;} 
	      .te-pageheader { border-top:1px solid #ccc; border-bottom:1px solid #ccc; background:#f7f7f7;}
		");
	}
	if ($opt['header']) {
		SDV($HTMLStylesFmt['teheader'], 
	    " .te-header  {margin-top:0.5em; padding:0.3em; border-top:1px solid #ccc; border-bottom:1px solid #ccc; background:#f7f7f7;}
		");	
	}
	//'footer' has no default, so it may be absent here
	if (!empty($opt['footer'])) {
		SDV($HTMLStylesFmt['tefooter'], 
	    " .te-footer {margin-top:0.5em; padding:0.3em; border-top:1px solid #ccc; border-bottom:1px solid #ccc; background:#f7f7f7;}
		");	
	}
	//number color defaults for css styling
	foreach(array('line','match','page') as $c) {
		if (isset($opt[$c.'num'])) $HTMLStylesFmt[$c.'num'] = " .{$c}num { color: {$opt[$c.'num-color']} ;} ";
	}	

	SDV($HTMLStylesFmt['telinktext'],
		" .te-linktext {color: {$opt['linktext']} } ");
		
	//case insensitive search unless case=1
	$qi = $par['qi'] = (@$opt['case']==1) ? '' : 'i';
	
	//number of pages searched: taken from $FmtV if available, else the list size
	$par['listcnt'] = (!empty($FmtV['$MatchSearched'])) ? $FmtV['$MatchSearched'] : count($list);
	//inits
	$par['sorcnt']=$par['matchnum']=$par['matchcnt']=$par['rowcnt']=$par['pagecnt']=0; $par['pagenum']= 1;
	$par['title'] = $opt['title'];
	$new = array(); $j = 0;
	//process each page from list of pagelist matches in turn
	foreach($list as $pn) { 
		$par['source'] = $pn;
		$par['pname'] = substr(strstr($pn, '.'),1);
		$par['pmatchnum'] = 0;
		$par['prevpmnum'] = 0;
		$par['hit'] = 0;				
		//get rows from source page
		$rows = TETextRows($pagename, $pn, $opt, $par); 
	#DEBUG 	echo "<br />".$pn; #show($rows,'rows');	
		if (!$rows) continue;
		//next page
		$j++;
		//processing lines (rows)
		foreach ($rows as $k => $row) {
			$par['linenum'] = $k+1;
			//skip pages which don't match
			if ($opt['unit']=='page') if(!preg_match("($pat)".$qi, $row)) continue; 
			//preserve empty rows for 'all including' pattern
			if (($opt['unit']=='line'|| $opt['unit']=='sent') && $row=="" && $pat==".") { 
					$new[$j]['rows'][] = $row; continue; 
			}
			//use row 'as is' if markup=on or whole page, no further row processing
			if ($opt['markup']=='on' && ($pat=="." || $opt['unit']=='page' || $opt['unit']=='para')) {
				 	if ($opt['unit']=='para' && !preg_match("($pat)".$qi, $row)) continue;
					$new[$j]['phead'] = TEPageHeader($pagename, $pn, $opt, $par);
					$new[$j]['rows'][] = $row; 
					$par['rowcnt']++;
					continue; //start with next source row
			}
			//change some markup into code or 'defuse', so it will not get rendered, or cut it 
			$row = TEMarkupCleaner($row, $opt, $par);
			//skip rows which don't match; in 'double' mode the row directly
			//after a hit is let through once (hit is reset to 0 for it)
			if ($opt['unit']=='line' || $opt['unit']=='sent' || $opt['unit']=='para' || $opt['unit']=='page') {
					if(preg_match("({$par['pat']})sm".$qi, $row))  { $par['hit'] = 1; }
					else { 
						if(isset($opt['double']) && $opt['double']==1 && $par['hit']==1) $par['hit']=0; 
						else continue; 
					}
			}
			//exclude lines containing matches with cut pattern
			if (isset($opt['cut']))
					if(preg_match("({$opt['cut']})".$qi, $row)) continue;
			//exclude lines containing matches with xcut pattern (from excludes)
			if (isset($par['xcut']))
					if(preg_match("({$par['xcut']})".$qi, $row)) continue;					
			//count matches in row
			$par['rowmatchcnt'] = preg_match_all("(".$par['pat'].")".$qi, $row, $mr);
			//remove text matching the snip pattern
			if(isset($opt['snip'])) 
				$row = preg_replace("({$opt['snip']})", '', $row);
			$row = ltrim($row);
			//empty row	
			if ($row=='') continue;	
			//highlight matches
			if(isset($opt['highlight']) && $pat!='.')
					$row = TEHighlight($opt, $par, $row);
			//numbering
			$par['pagenum'] = $j; //$par['pagecnt']+1; //from prev version
			$par['rowcnt']++;  
			//show($par['pagenum'],'par pagenum');
			$new[$j]['rowcnt'] = $par['rowcnt'];
			$new[$j]['pmatchcnt'] = $par['rowmatchcnt'] ;
			$par['prevmnum'] = $par['matchnum'];
			$par['matchcnt'] = $par['matchnum'] += $par['rowmatchcnt'];
			$par['prevpmnum'] = $par['pmatchnum'];
			$par['pmatchnum'] += $par['rowmatchcnt'];
			$rownum = ($opt['linenum']==1 || $opt['matchnum']==1 || $opt['pagenum']==1) ?
					TERowNumbers($opt, $par) : '';
			if(!isset($rownum))  continue;
			//add new result row
			$rc = $new[$j]['rowcnt']; 
			if($par['hit']==1) {
				$new[$j]['rows'][$rc] = $rownum.$row;
			} else { //hit=0: continuation row, appended to the previous result row
				//the previous row may be missing if it was cut after it had matched
				$prevrow = isset($new[$j]['rows'][$rc-1]) ? $new[$j]['rows'][$rc-1] : '';
				$new[$j]['rows'][$rc-1] = trim($prevrow,"\t\n\r\0\x0B")." ".trim($row);  
			}
			//add vertical spacing to para and double
			if (($opt['unit']=='para') && $opt['markup']!='source') 
					$new[$j]['rows'][] = "\n";
		} //end of page rows processing
		if (isset($new[$j]) && is_countable($new[$j]['rows']) && count($new[$j]['rows'])>0) {
			//add pagelink (prefix) row
			if($opt['phead'])
					$new[$j]['phead'] = TEPageHeader($pagename, $pn, $opt, $par);		
			$par['sorcnt']++;
			if (isset($opt['pfoot']))
				$new[$j]['pfoot'] = TEPageFooter($pagename, $pn, $opt, $par);
			$new[$j]['name'] = $pn;
		}
	} //end of source pages processing

	//slice list if we got #section
	if (@$opt['section'] && @$opt['count'])	TESliceList($new, $opt);
	$par['pagecnt'] = count($new); 
	//sort list by results per page, subsort by name
	if (isset($opt['order']) && $opt['order']=='results')	TESort($new);
## DEBUG echo "<pre>NEW "; print_r($new); echo "</pre>";
	//output text from array of rows, adding page prefix header (and footer)
	$out = '';
	foreach ($new as $i => $ar) {
		//markup pageheader
		if($opt['phead'])
			$out .= MarkupToHTML($pagename, $new[$i]['phead']);
		//add vspace
		foreach($new[$i]['rows'] as $k => $r) {
			if(isset($new[$i]['rows'][$k])) {
				$new[$i]['rows'][$k] = TEVSpace($r, $par, $opt); //add vertical spacing	
			}
		}
		//markup rows
		$rnew = implode("\n", $new[$i]['rows']);
		global $LinkFunctions;
		//for text links, temporarily route every link function to TELinkText
		if ($opt['textlinks']==1) { 
			$lf = $LinkFunctions;
			foreach($LinkFunctions as $k => $v)
				$LinkFunctions[$k] = 'TELinkText';
		}
		$out .= ($opt['markup']=='source') ? "<code class='escaped'>".$rnew."</code>"
					: MarkupToHTML($pagename, $rnew);
		if ($opt['textlinks']==1)	$LinkFunctions = $lf;
		//markup pagefooter
		if (isset($opt['pfoot']))
			$out .= MarkupToHTML($pagename, $new[$i]['pfoot']);
	}
	//stop timer
	TEStopwatch($par);	
	//make header and footer (TEHeader may set $opt['footer'] by reference)
	$header = TEHeader($opt, $par);
	$header = MarkupToHTML($pagename, $header); 
	$footer = TEFooter($opt, $par);
	$footer = MarkupToHTML($pagename, $footer);		
	$out = $header."<div class='te-results'>".$out."</div>".$footer;
	StopWatch('TextExtract end'); 
	return Keep($out);
} //}}}


//make rows array from source page
// Reads the text of page $source and splits it into rows according to
// $opt['unit'] (line, sent, para or page).
//   $pagename  page the extract is shown on; this page itself is never read
//   $source    name of the page to read
//   $opt       options; keys read here: section, phrase, markup, lines, unit
//              (and 'pat' / '-' for the exclude test below)
//   $par       internal parameters (by reference); only $par['qi'] is read here
// Returns an array of rows; '' if $source is the current page or cannot be
// read; nothing (NULL) if the exclude test matches.
function TETextRows($pagename, $source, $opt, &$par ) {
	//never extract from the page the results are displayed on
	if ($source==$pagename) return '';
	$page = ReadPage($source);
	if (!$page) return '';
  $text = $page['text'];
	//use pagename#section if present
	if(isset($opt['section']))
		$text = TextSection($text, $source.$opt['section']);
  //remove inline markup from text
  if ($opt['phrase']==1 || $opt['markup']=='text')
    $text = TERemoveInlineMarkup($text);
  //skip page if it has an exclude match
  //NOTE(review): this tests $opt['pat']['-'], but nothing visible in this file
  //sets $opt['pat'] (the pattern is kept in $par['pat']), so this branch may
  //never run - confirm the intended key
  if (isset($opt['pat']['-']) && $opt['pat']['-']!='')
	  foreach ($opt['-'] as $pat) {
			if (preg_match("($pat)".$par['qi'], $text)) return; } 
  //skip page if it has no match; all inclusive elements need to match (AND condition)
  //(disabled code below)
  /*foreach($opt[''] as $v) {
	$f = str_replace("\xc3\x9f","ss",$v); 
	$opt[''][] = $f;
  }*/
  #show($opt[''],'opt[]');
  #foreach ($opt[''] as $pat)  {
  #  if (!preg_match("($pat)".$par['qi'], $text)) return; }		

	$text = rtrim(Qualify($source, $text));
	$rows = explode("\n", rtrim($text)); //make text lines into rows array
	//use range of lines: 'a..b', 'a..', '..c', '-n' or 'n'
	if(isset($opt['lines'])) {
			$ol = $opt['lines'];
			$cnt = count($rows);
			if(strstr($ol,'..')) {
				//\d* also yields empty matches at the dots; the indexes used
				//below pick the numbers out of that match list
				preg_match_all("/\d*/", $ol, $k);
				$a=$k[0][0];  $b=$k[0][3]; $c=$k[0][2];
				if($a && $b) 
					//lines a to b (1-based, inclusive)
					$rows = array_slice($rows, $a-1, $b-$a+1);
				else if($a)
					//from line a to the end
					$rows = array_slice($rows, $a-1);
				else if($c)
					//the first c lines
					$rows = array_slice($rows, 0, $c);
			}		
			else if($ol[0]=='-')
				//negative number used as array_slice offset: rows counted from the end
				$rows = array_slice($rows, $ol);
			//plain number: the first n rows
			else $rows = array_slice($rows, 0, $ol); 
			
	}
	switch ($opt['unit']) {
		//unit=line - already got line rows
		default:  break; 
		//unit=sent (sentence) - split lines into sentences
		case 'sent': 
				//split on whitespace that follows . ! or ? (optionally followed by a
				//quote mark), unless it follows one of the listed abbreviations
				$re = '/# Split sentences on whitespace between them.
		    (?<=[.!?]|[.!?][\'"])(?<!
		      Mr\.| Mrs\.| Ms\.| Jr\.| Dr\.| Prof\.| Sr\.                       
		    )\s+/ix';
		    $nr = array(); 
		    foreach($rows as $k => $r) {
		    	//an empty line becomes a single space instead of being dropped
		    	if($r=='') $r = ' '; //continue;
					$nr = array_merge($nr, preg_split($re, $r, -1, PREG_SPLIT_NO_EMPTY)); 
				};
				$rows = $nr;
				break;
		//unit=para: - combine rows to paragraph rows
		case 'para':
			$paras = array(); $j=0;
			$paras[0] = '';
			foreach($rows as $r) {
				$r = rtrim($r);
				//a blank line starts the next paragraph
				if ($r=='') { 
					$j++;
					$paras[$j] = ''; 
					continue; 
				}
				$paras[$j] .= $r."\n";
			}
			$rows = $paras;
			break;
		//unit=page: - combine rows into one row
		case 'page':
				$part = implode("\n",$rows);
				unset($rows);
				$rows[0] = $part;
				break;
	} 
	return $rows;
} //}}}


// Strip inline wiki markup from $text by applying every pattern =>
// replacement pair of $TERemoveMarkupPatterns, one after the other in the
// order they are stored. Returns the resulting text.
function TERemoveInlineMarkup($text) {
	global $TERemoveMarkupPatterns;
	// with array arguments preg_replace() runs the pairs sequentially,
	// each one working on the result of the previous one
	$searches = array_keys($TERemoveMarkupPatterns);
	$replacements = array_values($TERemoveMarkupPatterns);
	return preg_replace($searches, $replacements, $text);
} //}}}

//cleanup of markup
// Prepares one result row for display by removing, escaping or converting
// wiki markup, depending on $opt['markup'].
//   $row  text of the row (may contain several lines)
//   $opt  options; keys read here: markup, textlinks
//   $par  internal parameters; only $par['qi'] is read here
// Returns the cleaned row. For markup=source only < and > are encoded.
function TEMarkupCleaner($row, $opt, $par) {
	global $KeepToken;
	if ($opt['markup']=='source') {
		//clean <>"tag" characters 
		$row = str_replace("<","&lt;", $row);
		$row = str_replace(">","&gt;", $row);
		//that's all for 'source' processing
		return $row;
	}
	$new = array();
	//fix orphaned @],[@,=],[= 
	//a closer without an opener before it gets an opener prepended; an opener
	//without a closer after it gets a closer appended
	//NOTE(review): the !=0 tests also skip a marker found at position 0 - confirm intent
	foreach(array("@","=") as $x) {
		$a = strpos($row,'['.$x); $b = strpos($row,$x.']');
		if ($b!=0 && ($a===false || $a>$b)) $row = '['.$x.$row;
		else if ($a!=0 && ($b===false || $a>$b)) $row .= $x.']';
	}	
	//keep escaped text using tokens
	//each [@...@] / [=...=] span is swapped for a <__TOK__n__> placeholder so the
	//rewrites below do not touch it; the spans are put back at the end
	$keep = array();
	if (preg_match_all("/\\[([=@])(.*?)\\1\\]/s".$par['qi'], $row, $m)) {
		foreach ($m[0] as $i => $v) {
			$keep[$i][0] =  $v;
			$keep[$i][1] = $m[1][$i];
			$row = str_replace( $v, "<__TOK__".$i."__>", $row);
		}
	}
	//directives (: ... :) possibly multi-line
	//removed entirely for 'cut' and 'text'
	if ($opt['markup']=='cut' || $opt['markup']=='text') {
			$row = preg_replace("/\\(:(\\w+\\b.*?):\\)/s", "", $row);
	}
	//all following rewrites work on single lines
	$lines = explode("\n", $row);
	foreach ($lines as $k => $row) {
			//extra spaces
			$row = preg_replace("/\\n\\s+/", "\n", $row);
			//directives (: ... :) encoding
			if ($opt['markup']=='code') {
				$row = preg_replace("/\\(:(comment)\\s+(.*?)\\s*:\\)/", "[@(:$1:@] $2 :)", $row); 
				$row = preg_replace("/\\(:(\\w+\\b.*?):\\)/", "[@(:$1:)@]", $row);
			}
			//fixing double and empty [@ and [=
			$row = preg_replace("/\\[([@=])\\s*\\[\\1/","[\\1",$row);
			$row = preg_replace("/([@=])\\]\\s*\\1\\]/","\\1]",$row);
			$row = preg_replace("/\\[([@=])\\s*\\1\\]/","",$row);
			//whitespace						
			$row = preg_replace("/^\\s+/", "", $row);
			//A: Q: 
			$row = preg_replace("/^[AQ]:\\s+/", "", $row);
			//code and cut treat some markup differently
			if ($opt['textlinks']==1) {
					//variable link
					//the $ in front of a WikiWord is written as &#36;
					global $WikiWordPattern;
					$row = preg_replace("/\\$($WikiWordPattern)\\b/", "&#36;$1", $row);			
			}
			switch($opt['markup']) {
				case 'text':
					$row = TERemoveInlineMarkup($row);
					//follow on with 'cut'						
				case 'cut':
					//divs >>...<< : remove
					$row = preg_replace("/>>(.*?)<</", "", $row);
					//anchors : remove
					$row = preg_replace("/(\\[\\[#[A-Za-z][-.:\\w]*\\]\\])/","",$row);
					//Attach:..... : remove
					//NOTE(review): the lazy .*? at the end of the pattern can match
					//nothing, so only the 'Attach:' prefix itself looks to be removed - confirm
					$row = preg_replace("/(Attach:.*?)/","",$row);
					break;	
				case 'code':
					//indents: remove
					$row = preg_replace("/^-+[<>]\\s*/", "", $row);				
					//unordered list items: bullets to * 
					$row = preg_replace("/^(\\*+)(.*?)$/", "&#42;$2", $row);
					//ordered list items: numerals to #
					$row = preg_replace("/^(\\#+)(.*?)$/", "&#35;$2", $row);				
					//definition list items: to :
					$row = preg_replace("/^(:+)(?=(\s*)([^:]+):)/", "&#58; ", $row);
					//divs >>...<< 	: escape			
					$row = preg_replace("/>>(.*?)<</", "[@>>$1<<@]", $row);
					//anchors: escape
					$row = preg_replace("/(\\[\\[#[A-Za-z][-.:\\w]*\\]\\])/","[@$1@]",$row);
					//wiki styles %...% : escape
					$row = preg_replace("/(%.*?%)/", "[@$1@]", $row);
					//tables || || || @ escape
					$row = preg_replace("/^\\|\\|(.*)$/", "[@||$1 @]", $row); 
					break;
			}	
			//change all headings to large and bold text
			$row = preg_replace("/^(!{1,6})(.*)/","[+''' $2 '''+]" , $row);
			//markup expression encoding
			$row = preg_replace("/\\{\\((\\w+\\b.*?)\\)\\}/", "[@{($1)}@]", $row); 
			$row = trim($row);
			//lines that end up empty are dropped
			if ($row=='') continue;
			$new[$k] = $row;
		}
	$row = implode("\n", $new);
	//re-inserting code strings via tokens		
	foreach ($keep as $i => $v)
			$row = str_replace("<__TOK__".$i."__>", $keep[$i][0], $row);
	return $row;
} //}}}


//insert markup for highlighting matches
// Wraps every match of $par['pat'] in $row with the keep tokens '01-TE' and
// '02-TE' (their HTML is set up in $KPV by TextExtract()).
//   $opt  options; keys read here: markup, shorten
//   $par  internal parameters (by reference); keys read: pat, qi, hitoklen
//   $row  text of the row
// Returns the row with tokens inserted, shortened by TEShortenRow() when
// $opt['shorten'] > 0 and markup is not 'source'.
function TEHighlight($opt, &$par, $row) {
	global $LinkPattern, $UrlExcludeChars, $ImgExtPattern, $KeepToken, $KPV;
	//for source view we don't want whole links highlight:
	if ($opt['markup']=='source') $linkpat = $urlpat = '';
	else {
		//matches in links: highlight entire link, and other matches
		$linkpat = "\\[\\[\\s*(.*?)\\]\\]";
		$urlpat = "($LinkPattern)\\/\\/([^\\s$UrlExcludeChars]*[^\\s.,?!$UrlExcludeChars])";
	}
	//find [[links]], urls and plain pattern matches together with their offsets
	if (preg_match_all("(($linkpat)|($urlpat)|({$par['pat']}))u".$par['qi'], $row, $m, PREG_OFFSET_CAPTURE)) {
		## DEBUG echo "<br> PATTERN: ".$par['pat']; echo "<pre>OTHER "; print_r($m[0]); echo "</pre>";
		$k = 0; $mpos = array();
		foreach($m[0] as $i => $v) { 
			//links and urls are only highlighted if they contain the search pattern
			if (!preg_match("({$par['pat']})u".$par['qi'], $v[0])) continue;
			//a matched url gets a space appended
			//NOTE(review): strlen($item) below then includes that extra character,
			//so one more byte of $row is replaced than was matched - confirm intent
			if (isset($m[4]) && preg_match("/$LinkPattern/",$m[4][$i][0]))
					$item = $v[0]." ";
			else $item = $v[0];
			//offsets refer to the original row: shift by the length of the
			//token pairs inserted so far
			$pos = $v[1] + $k * $par['hitoklen'];
			$row = substr_replace($row, $KeepToken."01-TE".$KeepToken.$item.$KeepToken."02-TE".$KeepToken, $pos, strlen($item));
			//strip trailing '%' and spaces from the row
			$row = rtrim($row,'% ');
			$k++;
			$mpos[] = $pos;
		}
		if ($opt['shorten']>0 && $opt['markup']!='source')
			$row = TEShortenRow($row, $par, $opt);
	}
	return $row;
} //}}}


// Append the vertical spacing token a result row needs.
//   $row  text of the row
//   $par  internal parameters; 'vspace' and 'br-tag' hold the keep tokens
//   $opt  options; keys read here: markup, shorten
// 'source' rows are returned trimmed. While $HTMLPNewline is non-empty the
// row is returned unchanged. Otherwise the spacer token (shortened rows)
// or the line break token is appended.
function TEVSpace($row, $par, $opt) {
	global $HTMLPNewline;
	if ($opt['markup']=='source') {
		return trim($row);
	}
	if ($HTMLPNewline !='') {
		return $row;
	}
	if (!($opt['shorten']>0)) {
		return $row.$par['br-tag'];
	}
	//shortened rows: the global is set to '' and the spacer block is used
	$HTMLPNewline = '';
	return $row.$par['vspace'];
} //}}}


//shorten row
// Reduces a highlighted row to the words around its highlights and puts
// $opt['ellipsis'] where words were left out.
//   $row  row text containing keep tokens (see TEHighlight)
//   $par  internal parameters (not read here)
//   $opt  options; keys read here: shorten, lwords, rwords, ellipsis
// Returns the shortened row.
function TEShortenRow($row, $par, $opt) {
	global $KeepToken;
	//number of words left and right of highlight
	//shorten > 1 gives the number of words on the left (and twice that on the
	//right); otherwise lwords / rwords are used
	$a = ($opt['shorten']>1) ? $opt['shorten'] : $opt['lwords']; 
	$b = ($opt['shorten']>1) ? 2*$opt['shorten'] : $opt['rwords'];
	$hi = $new = array();
	$words = explode(' ', $row);
	//$hi: indexes of the words which contain a keep token
	foreach ($words as $i => $wd)
			if (strpos($wd, $KeepToken)!==false) $hi[] = $i;
	//walk over all words; for each word look at the highlights in turn to
	//decide whether it is kept, replaced by an ellipsis or dropped
	for ($i=0; $i < count($words); $i++) {
		foreach ($hi as $k => $n) {
			//word lies more than $a words before this highlight
			if (($n-$a) > $i) { 
				if (($n-$a) == $i+1)
					if (!isset($new[$i])) $new[$i] = $opt['ellipsis'];
				if (isset($new[$i-1]) && $new[$i-1]!=$opt['ellipsis']) $new[$i] = $opt['ellipsis'];
				continue 2; 
			}
			//first word beyond the right window of the last highlight
			if ($i == end($hi)+$b+1) $new[$i] =  $opt['ellipsis'];
			//word lies beyond the right window of this highlight: try the next one
			if ($i > $n+$b) continue;
			if(isset($hi[$k+1]) && $i==($hi[$k+1]-$a)) continue;
			if (isset($new[$i])) continue 2;
			//word is inside the window of this highlight: keep it
			$new[$i] = $words[$i]; 
			continue 2;
		}
	}
	$row = implode(' ', $new);
	return $row;
} //}}}


//make header
// Build the wiki markup of the results header.
//   $opt  options (by reference); keys read: header, timer, title.
//         For header 'all'/'full' $opt['footer'] is set to the end-of-results line.
//   $par  internal parameters; keys read: matchnum, listcnt, pagecnt, pattern, time
// $opt['header'] 'count'/'counter' gives a result counter, 'all'/'full' the
// full title line; any other value is used as a template for TEVarReplace().
// A non-empty header is wrapped in a te-header div.
function TEHeader(&$opt, $par) {
	$cnt = $par['matchnum'];
	$mode = $opt['header'];
	//the div wrapper is only added for a non-empty header setting
	$open = $close = "";
	if ($mode) {
		$open = "(:div001 class='te-header':)\n";
		$close = "\n(:div001end:)";
	}
	if ($mode=='count' || $mode=='counter') {
		$body = "'''$[Results:] $cnt'''";
	}
	elseif ($mode=='all' || $mode=='full') {
		$time = '';
		if ($opt['timer']) $time = 'in '.$par['time'];
		$pgs = '$[page]';
		if ($par['listcnt']>1) $pgs = '$[pages]';
		$from = "$[from] {$par['listcnt']} $pgs $[searched]";
		if ($par['pagecnt']>1)
			$from = "$[on] {$par['pagecnt']} $[pages] ".$from;
		$body = "[[#extracttop]]%lfloat%[+ '''{$opt['title']} &nbsp;&nbsp;%green%{$par['pattern']}%%''' +]  %right%''{$cnt} $[results]  {$from}  {$time}''";
		//the matching footer links back to the anchor placed above
		$opt['footer'] = "%center% '''$[End of] {$opt['title']}'''  &nbsp;&nbsp; [[#extracttop|$[(start)]]]";
	}
	else {
		$body = TEVarReplace($mode, $par);
	}
	return $open.$body.$close;
} //}}}

//make footer
// Build the wiki markup shown below the results.
//   $opt  options; keys read here: footer (optional), error
//   $par  internal parameters; keys read here: pagecnt (pages with results),
//         listcnt (pages searched)
// Returns the footer div (only if a footer is set and there are results),
// followed by an error line when $opt['error'] is 1 and nothing was
// searched or nothing was found.
function TEFooter($opt, $par) {
	$out = '';
	//'footer' has no default, so it may be absent
	if (!empty($opt['footer']) && $par['pagecnt']>0) {
		$out .= "\n(:div002 class='te-footer':)".TEVarReplace($opt['footer'], $par)."\n(:div002end:)";
	}
	if (isset($opt['error']) && $opt['error']==1) {
		//'nothing to search' takes precedence; 'no matches' is reported
		//when pages were searched but none produced a result
		if ($par['listcnt']==0)
			$out .= "\n%red%$[Error: no pages to be searched!]%%";
		elseif ($par['pagecnt']==0)
			$out .= "\n%red%$[Found no matches!]%%";
	}
	return $out;	
} //}}}

//make page header
// Build the wiki markup of the header shown above the results of one page.
//   $pagename  page the extract is shown on (not used here)
//   $source    name of the page the results come from
//   $opt       options; keys read here: phead, pagenum, pagenum-color
//   $par       internal parameters (by reference); 'pagenum' is read
// phead 'link' gives a link to the page, 'linkmod' adds last-modified
// information, any other value is used as a template for TEVarReplace().
// The result is wrapped in a te-pageheader div.
function TEPageHeader($pagename, $source, $opt, &$par) {
	//optional page number in front of the link
	$pnum = '';
	if ($opt['pagenum']==1) $pnum = ($par['pagenum']).". ";
	$kind = $opt['phead'];
	if ($kind=='link') {
		$body = ($opt['pagenum']==1 && $opt['pagenum-color']!='')
			? "'''%color={$opt['pagenum-color']}%{$pnum}%% [+ [[$source]] +]'''"
			: "'''[+ [[$source]] +]'''";
	}
	elseif ($kind=='linkmod') {
		$lmod = PageVar($source,'$LastModified');
		$lmby = PageVar($source,'$LastModifiedBy');
		$body = "%rfloat%''$[last modified by] [[~{$lmby}]] $[on] {$lmod}'' %left%'''%color={$opt['pagenum-color']}%{$pnum}%%[+ [[$source]] +]'''";
	}
	else {
		$body = TEVarReplace($kind, $par);
	}
	return "\n>>te-pageheader<<\n".$body."\n>><<\n";
} //}}

//make page footer
// Returns the per-page footer: $opt['pfoot'] preceded by a newline.
// $pagename, $source and $par are accepted but not used.
function TEPageFooter($pagename, $source, $opt, &$par) {
	return "\n".$opt['pfoot'];
} //}}

//make results (line) numbers
// Build the number prefix of a result row.
//   $opt  options; keys read here: linenum, matchnum, pagenum
//   $par  internal parameters; keys read here: pat, pagenum, linenum,
//         rowmatchcnt, matchnum, prevmnum, pmatchnum, prevpmnum
// Line numbers take precedence over match numbers. With pagenum=1 the page
// number is put in front and match numbers count per page. Returns the
// Keep()-protected HTML, or '' if no numbering applies.
function TERowNumbers($opt, $par) {
	#show($par,'PAR');
	$perpage = ($opt['pagenum']==1);
	if ($opt['linenum']==1) {
		$html = "<span class='linenum'>{$par['linenum']}. </span>";
	}
	elseif ($opt['matchnum']==1  && $par['pat']!=".") {
		//number of the last match in this row
		$num = $perpage ? $par['pmatchnum'] : $par['matchnum'];
		//several matches in one row are shown as a range first-last
		if ($par['rowmatchcnt']>1) {
			$before = $perpage ? $par['prevpmnum'] : $par['prevmnum'];
			$num = ($before+1)."-".$num;
		}
		$html = "<span class='matchnum'>$num. </span>";
	}
	else {
		return '';
	}
	if ($perpage)
		$html = "<span class='pagenum'>{$par['pagenum']}.</span>".$html;
	return Keep($html,'T');
} //}}}

//substitution of pseudo template variables
// Replaces every {$$key} in $text with the value of $par['key'].
// Array values in $par are skipped. The replacements are applied one key
// after the other, in the order of $par. Returns the resulting text.
function TEVarReplace ($text, $par) {
	$names = $values = array();
	foreach($par as $key => $value) {
		if (!is_array($value)) {
			$names[] = '{$$'.$key.'}';
			$values[] = $value;
		}
	}
	//with array arguments str_replace() works through the pairs in sequence
	return str_replace($names, $values, $text);
} //}}}

//Link function to suppress links
// Has the parameter list of a link function, but returns only the link text
// followed by the title inside a te-linktext span instead of a real link.
function TELinkText($pagename,$imap,$path,$title,$txt,$fmt=NULL) {
	$label = $txt.$title;
	return "<span class='te-linktext'>{$label}</span>";
} //}}}

//timer
// Stores the time elapsed since $par['stime'] in $par['time'], formatted
// as seconds with two decimals followed by the phrase $[seconds].
//   $par  internal parameters (by reference); 'stime' is the start time in
//         seconds as a float, 'time' receives the formatted text
function TEStopwatch(&$par) {
		//microtime(true) returns the current time as a float directly; unlike
		//strtok() it does not disturb a tokenisation in progress elsewhere
		$wtime = microtime(true) - $par['stime'];
		$xtime = sprintf("%04.2f ", $wtime); //time in secs	
		$par['time'] = $xtime." $[seconds]";
} //}}}


// markup (:extract ....:) search form
Markup('extractform', 'directives','/\\(:extract\\s*(.*?)\\s*:\\)/', "TEFormMarkup");
// extractor search form
// Markup callback: builds the HTML of the search form.
//   $m  regex match of the markup rule; $m[1] holds the arguments
// The arguments are merged over $ExtractFormOpt. The form is submitted with
// method 'get' with action=search and fmt=extract; options which are not
// form fields are passed on as hidden inputs. Returns the Keep()-protected
// HTML of the form.
function TEFormMarkup($m) {
	global $ExtractFormOpt, $InputValues, $EnablePathInfo,$ExtractFormInputType;
	//brings $pagename into scope
	extract($GLOBALS['MarkupToHTML']);
	$opt = ParseArgs($m[1]);
	//a page given in the markup hides the 'On pages' field
	if (isset($opt['page'])) $hiddenpagefield =  1;
	$opt = array_merge((array)$ExtractFormOpt,  (array)$opt);
	$opt['action'] = 'search';
	$opt['fmt'] = 'extract';
	$target = (isset($opt['target'])) ? MakePageName($pagename, $opt['target']) : $pagename;
  	$opt['n'] = IsEnabled($EnablePathInfo, 0) ? '' : $target;
	if ($opt['retain-input']==1) {
		foreach ($_GET as $k=>$v) {
			//array parameters cannot be shown in a form field
			if (is_array($v)) continue;
			//the values end up in single-quoted attributes, so single
			//quotes have to be encoded as well
			$InputValues[$k] = htmlspecialchars($v, ENT_QUOTES | ENT_SUBSTITUTE);
		}
	}	
	foreach ($opt as $k => $v) {
		if ($v == '' || is_array($v)) continue;
		$v = str_replace("'", "&#039;", $v);
		$opt[$k] = $v;
		if (!isset($InputValues[$k])) $InputValues[$k] = $v;
	}	
	//without a query from the request, fall back to a fixed pattern= (or empty)
	if (!isset($InputValues['q']))
		$InputValues['q'] = isset($opt['pattern']) ? $opt['pattern'] : '';
	//same for the pages field: keep a value already present, else use defaultpage=
	if (!isset($InputValues['name']))
		$InputValues['name'] = isset($opt['defaultpage']) ? $opt['defaultpage'] : '';
	$checkword = (isset($InputValues['word']))? "checked=1" : '';
	$checkcase = (isset($InputValues['case']))? "checked=1" : '';
	$checkphrase = (isset($InputValues['phrase']))? "checked=1" : '';
	$checkregex = (isset($InputValues['regex']))? "checked=1" : '';
	SDV($ExtractFormInputType, 'text'); 
	//form
	$out = FmtPageName("<form class='wikisearch' action='\$PageUrl' method='get'>", $target);
	$out .= "\n<table>";
	//a fixed pattern is sent as hidden field instead of a search box
	if (isset($opt['pattern']))
		$out .= "<input type='hidden' name='q' value='{$InputValues['q']}' /> \n";
	else $out .= "<tr><td>{$opt['searchlabel']} </td><td><input type='$ExtractFormInputType' name='q' value='{$InputValues['q']}' class='inputbox searchbox' size='{$opt['size']}' /> </td></tr> \n";
	if (!isset($hiddenpagefield))
		$out .= "<tr><td>{$opt['pageslabel']} </td><td><input type='text' name='name' value='{$InputValues['name']}' class='inputbox searchbox' size='{$opt['size']}' /> </td></tr> \n";
	//checkboxes are only shown for options which are not fixed by the markup
	if (!isset($opt['pattern'])) {
		if (!isset($opt['case']))   $out .= "<tr><td></td><td><input type='checkbox' name='case' value='1' $checkcase/> {$opt['caselabel']}</td></tr>";
		if (!isset($opt['phrase'])) $out .= "<tr><td></td><td><input type='checkbox' name='phrase' value='1' $checkphrase/> {$opt['phraselabel']}</td></tr>";
		if (!isset($opt['word']))   $out .= "<tr><td></td><td><input type='checkbox' name='word' value='1' $checkword/> {$opt['wordlabel']}</td></tr>";
	}
	//the regex checkbox is only offered when 'regex' is given in the markup
	if (isset($opt['regex']))
		$out .= "<tr><td></td><td><input type='checkbox' name='regex' value='1' $checkregex/> {$opt['regexlabel']}</td></tr>";
	$out .= "<tr><td></td><td>&nbsp;&nbsp;&nbsp;&nbsp;<input type='submit' class='inputbutton searchbutton' value='{$opt['button']}' /></td></tr></table> \n";
	//set other optional parameters as hidden fields
	foreach ($opt as $k => $v) {
		if ($v == '' || is_array($v)) continue;
		if (in_array($k, array('pattern','name','defaultpage','q','label','value','size','searchlabel','pageslabel','wordlabel','caselabel','regexlabel','regex'))) continue;
		$k = str_replace("'", "&#039;", $k);
		$v = str_replace("'", "&#039;", $v);
		$out.= "\n<input type='hidden' name='".$k."' value='".$v."' />";
	}	
	$out .= "</form>";
	return Keep($out);
} //}}}


## (extract ......) same as PowerTools (pagelist.... fmt=extract) [all pagelist parameters allowed]
$MarkupExpr['extract'] = 'MxTextExtract($pagename, $argp, $args)'; 
// Markup expression handler for {(extract ...)}.
//   $pagename  name of the current page
//   $argp      parsed arguments; every key except '#' is passed on as option
//   $args      positional arguments; each one is added to the query 'q'
//              as a double-quoted term
// Returns the output of FmtPageList() for fmt=extract with runs of
// newlines reduced to a single newline.
function MxTextExtract($pagename, $argp, $args) {
	StopWatch('extract start');
	unset($argp['#']);
	$opt['fmt'] = 'extract';
	foreach($argp as $k => $v)	$opt[$k] = $v;
	//'q' may not exist yet (no q= argument): start from an empty string
	//rather than appending to an undefined key
	foreach($args as $k => $v)
		$opt['q'] = (isset($opt['q']) ? $opt['q'] : '').' "'.$v.'"';
	$out = FmtPageList('$MatchList', $pagename, $opt, 0); 
	$out = preg_replace("/[\n]+/s","\n",$out);
	StopWatch('extract end');
	return $out;
} //}}}


//fmt=extract for (:extract:) and (:pagelist:) and (:searchbox:)
SDV($FPLFormatOpt['extract'], array('fn' =>  'FPLTextExtract'));
/**
 * Pagelist format function registered for fmt=extract.
 *
 * Normalises the search options, builds the list of candidate pages with
 * MakePageList() and passes that list to TextExtract().
 *
 * @param string $pagename name of the current page
 * @param array  $matches  passed by reference by the pagelist machinery; not used here
 * @param array  $opt      pagelist/extract options ('' holds the positional search terms,
 *                         '-' the excluded terms)
 * @return mixed whatever TextExtract() returns for the resulting page list
 */
function FPLTextExtract($pagename, &$matches, $opt) {
	##DEBUG echo "<pre>OPT "; print_r($opt); echo "</pre>";	
	global $FmtV, $EnableStopWatch, $KeepToken, $KPV, $PageListFilters;
	//swap the page list term filter: a negative value disables the standard one
	$PageListFilters['PageListTermsTargets'] = -10; //not used
	$PageListFilters['TEListTermsTargets'] = 160; //used as alternative
	$EnableStopWatch = 1;
	StopWatch('TextExtract pagelist begin');
	//start time: microtime() returns "usec sec", the two parts are added up
	$opt['stime'] = strtok(microtime(), ' ') + strtok('');
	//'q' may be missing when the function is reached without a query string
	$opt['q'] = isset($opt['q']) ? ltrim($opt['q']) : '';
	//if search term contains terms in double quotes switch on 'text' option to remove all inline markup when searching
	if (preg_match('/\\".*\\"/',$opt['q'])) $opt['text'] = 1;
	if (!empty($opt[''])) {
		foreach ($opt[''] as $k => $v)
			$opt[''][$k] = htmlspecialchars_decode($v);
	}
	//treat single . search term as request for regex 'all characters'
	if (isset($opt[''][0]) && $opt[''][0]=='.') $opt['regex'] = 1;
	if (isset($opt['pattern']) && $opt['pattern']=='.') $opt['regex'] = 1;
	//MakePageList() does not evaluate terms as regular expressions, so we save them for later
	if (isset($opt['regex']) && $opt['regex']==1) {
		//only replace the pattern when there are positional terms; with none
		//(e.g. just pattern=.) an already supplied pattern must survive, and
		//implode() must not be called on a missing list
		if (isset($opt[''])) $opt['pattern'] = implode(' ', (array)$opt['']);
		unset($opt['']);
	}
	//merge 'page' into 'name': page alone becomes name, both are joined with a comma
	if (!isset($opt['name']) && isset($opt['page'])) $opt['name'] = $opt['page'];
	elseif (isset($opt['name']) && isset($opt['page'])) $opt['name'] .= ",".$opt['page'];
	if (isset($opt['name'])) unset($opt['page']);
	//allow search of anchor sections
	if (isset($opt['name'])) {
		//a '#' at position 0 is not treated as a section (strpos() returns 0)
		if ($sa=strpos($opt['name'],'#')) {
			$opt['section'] = strstr($opt['name'],'#');
			$opt['name'] = substr($opt['name'],0,$sa);
		}
	}
	//add excludes to '--', for later cutting of matching rows
	if (isset($opt['-'])) $opt['--'] = $opt['-'];
	//unset excludes for page matching, except if unit is 'page' (and it will be cut anyway if matched)
	if ((!isset($opt['unit']) || $opt['unit']!='page') && isset($opt['-'])) unset($opt['-']); 
	//create page list by searching pages for search terms
	$list = MakePageList($pagename, $opt, 0);
	#DEBUG 	echo "<pre>list after MakePageList "; print_r($list); echo "</pre>";
	//extract page subset according to 'count=' parameter
	if (!empty($opt['count']) && empty($opt['section']))
		TESliceList($list, $opt);
	return TextExtract($pagename, $list, $opt);
} //}}}

//alternative for PageListTermsTargets with hook to TERemoveInlineMarkup for option 'text'
//this allows page matches to a search phrase even if part of the phrase is enclosed with inline markup
/*
 * Page list filter callback that matches pages against the include/exclude
 * search terms and an optional link= target. It is called once per phase,
 * selected by $opt['=phase'] (PAGELIST_PRE, PAGELIST_ITEM, PAGELIST_POST).
 *
 * $list  page names under consideration (narrowed in the PRE phase)
 * $opt   page list options; the PRE phase stores the compiled patterns in
 *        '=inclp', '=exclp' and '=linkp'
 * $pn    page name; in the ITEM phase the page being tested
 * $page  page data; read with ReadPage() in the ITEM phase when empty
 *
 * Returns, in the PRE phase, PAGELIST_ITEM|PAGELIST_POST when there is
 * something to test per page and 0 otherwise; in the ITEM phase 1 when the
 * page passes and 0 when it is rejected; in the POST phase 0.
 */
function TEListTermsTargets(&$list, &$opt, $pn, &$page) {
	global $FmtV, $StrFoldFunction;
  	//page names collected during the ITEM phase and handed to PageIndexQueueUpdate() in the POST phase
  	static $reindex = array();
  	$fold = $StrFoldFunction;
  	switch ($opt['=phase']) {
    	case PAGELIST_PRE:
	      $FmtV['$MatchSearched'] = count($list);
	      //collect folded include terms ('' and '+') and exclude terms ('-')
	      $incl = array(); $excl = array();
	      foreach((array)@$opt[''] as $i) { $incl[] = $fold($i); }
	      foreach((array)@$opt['+'] as $i) { $incl[] = $fold($i); }
	      foreach((array)@$opt['-'] as $i) { $excl[] = $fold($i); }
	      $indexterms = PageIndexTerms($incl); 
	      //one case-insensitive pattern per include term; terms made only of word
	      //characters / high bytes use '$' as delimiter, all others use '/'
	      foreach($incl as $i) {
	        $delim = (!preg_match('/[^\\w\\x80-\\xff]/', $i)) ? '$' : '/';
	        $opt['=inclp'][] = $delim . preg_quote($i,$delim) . $delim . 'i';
	      }
	      //all exclude terms are combined into a single alternation pattern
	      if ($excl) 
	        $opt['=exclp'][] = '$'.implode('|', array_map('preg_quote',$excl)).'$i';
	      //link= option: pattern tested against the page's 'targets' list in the ITEM phase
	      if (@$opt['link']) {
	        $link = MakePageName($pn, $opt['link']);
	        $opt['=linkp'] = "/(^|,)$link(,|$)/i";
	        $indexterms[] = " $link ";
	      }
	      if (@$opt['=cached']) return 0;
	      //drop every page returned by PageIndexGrep() for these terms from the list
	      if ($indexterms) {
		       StopWatch("PageListTermsTargets begin count=".count($list));
		       $xlist = PageIndexGrep($indexterms, true);
		       $list = array_diff($list, $xlist);
		       StopWatch("PageListTermsTargets end count=".count($list));
	    	}
	      //only ask for the ITEM and POST phases when there is a pattern to test
	      if (@$opt['=inclp'] || @$opt['=exclp'] || @$opt['=linkp']) 
	        return PAGELIST_ITEM|PAGELIST_POST; 
	      return 0;

	   case PAGELIST_ITEM:
	      //read the page if the caller did not supply it, counting the read in '=readc'
	      if (!$page) { $page = ReadPage($pn, READPAGE_CURRENT); $opt['=readc']++; }
	      if (!$page) return 0;
	      //reject (and queue for reindexing) pages whose targets do not contain the link
	      if (@$opt['=linkp'] && !preg_match($opt['=linkp'], @$page['targets'])) 
	         { $reindex[] = $pn; return 0; }
	      if (@$opt['=inclp'] || @$opt['=exclp']) {
	      	//searched text: page name, targets and page text, folded like the terms
	      	$text = $fold($pn."\n".@$page['targets']."\n".@$page['text']);
	      	//with option text=1 the text is passed through TERemoveInlineMarkup() before matching
	      	if (isset($opt['text']) && $opt['text']==1) 
	      		$text = TERemoveInlineMarkup($text); 
	       	//any exclude pattern that matches rejects the page
	       	foreach((array)@$opt['=exclp'] as $i) 
	         	if (preg_match($i, $text)) return 0;
	       	//every include pattern has to match; a miss on a '$'-delimited
	       	//pattern also queues the page for reindexing
	       	foreach((array)@$opt['=inclp'] as $i) 
	         	if (!preg_match($i, $text)) {
	            	if ($i[0] == '$') $reindex[] = $pn;
	            	return 0; 
	          	}
	      }
	      return 1;
	      
	   case PAGELIST_POST:
	      //hand the collected page names to the page index update queue and reset the collection
	      if ($reindex) PageIndexQueueUpdate($reindex);
	      $reindex = array();
	      return 0;
  }
} //}}}

//slice list for count= option
/**
 * Reduce a page list to the range requested by the count= option.
 *
 * The range bounds come from CalcRange(); when the second bound is smaller
 * than the first, the selected pages are returned in reverse order.
 *
 * @param array $list page list, replaced in place by the selected slice
 * @param array $opt  options; only 'count' is read
 */
function TESliceList(&$list, $opt) {
	$range = CalcRange($opt['count'], count($list));
	$first = $range[0];
	$last = $range[1];
	$reversed = ($last < $first);
	//bounds are 1-based; slice from the smaller to the larger one
	$low = $reversed ? $last : $first;
	$high = $reversed ? $first : $last;
	$slice = array_slice($list, $low-1, $high-$low+1);
	$list = $reversed ? array_reverse($slice) : $slice;
} //}}}

//sort by match count and subsort by name
/**
 * Sort result rows in place: highest 'pmatchcnt' first, rows with the same
 * count ordered by 'name' (natural order, case-insensitive).
 *
 * @param array $new list of rows, each holding 'pmatchcnt' and 'name';
 *                   the list is reindexed from 0
 */
function TESort(&$new) {
	//a single comparator handles both keys, so no look-ahead at the
	//neighbouring row (and no read past the end of the list) is needed
	usort($new, function ($a, $b) {
		if ($a['pmatchcnt'] != $b['pmatchcnt'])
			return ($a['pmatchcnt'] < $b['pmatchcnt']) ? 1 : -1;
		return strnatcasecmp($a['name'], $b['name']);
	});
} //}}}

//is_countable substitute for php versions <7.3
if (!function_exists('is_countable')) {
    /**
     * Fallback used when PHP does not provide is_countable():
     * true for arrays and for objects implementing Countable.
     */
    function is_countable($c) {
        if (is_array($c)) {
            return true;
        }
        return ($c instanceof Countable);
    }
}

//sort helper functions
/**
 * usort() comparator on 'pmatchcnt': the row with the larger count sorts first.
 * Returns the count of $b minus the count of $a.
 */
function TESortByMatchCnt($a, $b) {
	$second = $b['pmatchcnt'];
	$first = $a['pmatchcnt'];
	return $second - $first;
}
/**
 * usort() comparator on 'name': natural order, ignoring case.
 */
function TESortByName($a, $b) {
	$left = $a['name'];
	$right = $b['name'];
	return strnatcasecmp($left, $right);
}
//EOF