<?php if (!defined('PmWiki')) exit();
/**
  MigrateUTF8: Convert PmWiki page and attachment filenames to UTF-8
  Written by (c) Petko Yotov 2023-2024

  This text is written for PmWiki; you can redistribute it and/or
  modify it under the terms of the GNU General Public License
  as published by the Free Software Foundation; either version 3
  of the License, or (at your option) any later version.
  See pmwiki.php for full details and lack of warranty.
*/

$RecipeInfo['MigrateUTF8']['Version'] = '2024-07-14a';

# Core switches set by this recipe (presumably PmWiki features used or
# avoided by the migration page -- confirm against the PmWiki docs).
$PmTOC['Enable'] = 0;
$EnableSortable = 1;
$EnableCopyCode = 1;
$EnableHighlight = 1;

# Recipe defaults; SDVA() only fills in keys not already set.
#  MaxSeconds      time budget per request; HandleMigrateUTF8() stops and asks
#                  for a resubmit once $Now + MaxSeconds has passed
#  OldCharset      source encoding used when the request carries no 'oldcharset'
#  TempSuffux      suffix appended to the directory names to form the temporary
#                  target directories (the key name is misspelled, but it is
#                  the name read everywhere in this file, so it is kept)
#  BatchCheckTime  the clock is only consulted every N-th processed item
SDVA($MigrateUTF8, array(
  'MaxSeconds' => 10,
  'OldCharset' => 'Windows-1252',
  'TempSuffux' => '-temp-utf8',
  'BatchCheckTime' => 20,
));

# Source encodings offered in the (:migrate-charsets:) selector, as
# charset name => human readable label. The names are handed to pm_recode(),
# so they must be real charset identifiers (the ISO family is 8859).
SDVA($MigrateUTF8['charsets'], array(
  'Windows-1252' => 'Latin',
  'ISO-8859-1' => 'Latin',
  'ISO-8859-15' => 'Latin+Euro',
  'ISO-8859-2' => 'Central Europe',
  'ISO-8859-9' => 'Turkish',
  'ISO-8859-13' => 'Baltic',
  'Windows-1251' => 'Cyrillic',
  'ISO-8859-5' => 'Cyrillic',
));

# ?action=migr8 runs the migration handler.
SDVA($HandleActions, array('migr8'=>'HandleMigrateUTF8'));

# (:migrate-charsets:) directive, rendered by FmtMigrateCharsets().
$MarkupDirectiveFunctions['migrate-charsets'] = 'FmtMigrateCharsets';

/**
 * Render the (:migrate-charsets:) directive.
 *
 * Produces one "(:input select oldcharset ...:)" markup fragment per entry
 * of $MigrateUTF8['charsets'], using the charset name as the option value
 * and "name (label)" as the option caption.
 *
 * @param string $pagename Current page name (not used).
 * @return string Concatenated wiki markup, empty when no charsets are set.
 */
function FmtMigrateCharsets($pagename) {
  global $MigrateUTF8;
  $options = array();
  foreach ($MigrateUTF8['charsets'] as $name => $descr) {
    # Both parts end up inside double-quoted markup arguments.
    $value = PHSC($name, ENT_QUOTES);
    $caption = PHSC($descr, ENT_QUOTES);
    $options[] = "(:input select oldcharset \"$value\" \"$value ($caption)\":)";
  }
  return implode('', $options);
}


/**
 * Handler for ?action=migr8: previews and performs the filename migration.
 *
 * Flow, as implemented below:
 *  1. Requires 'admin' authorization on "$SiteAdminGroup.$SiteAdminGroup".
 *  2. Refuses to run until $EnableReadOnly is set and $Charset is 'UTF-8',
 *     and when "$WorkDir,old" or "$UploadDir,old" already exist.
 *  3. Collects the pending files, directories and words via prepareboth().
 *  4. When the form was posted with 'recode': creates the missing target
 *     directories, recodes $PageIndexFile into the temporary work directory,
 *     then moves the files (.htaccess files are copied instead). Work stops
 *     once $MigrateUTF8['MaxSeconds'] have elapsed; the clock is checked
 *     every $MigrateUTF8['BatchCheckTime'] items. When no file is left, the
 *     old directories are renamed to ",old" and the temporary ones take
 *     their place.
 *  5. Prints the preview tables, the progress report and the control form
 *     through MigratePrintFmt(), which ends the request.
 *
 * @param string $pagename Current page name, used for the links in the output.
 * @return mixed Only returns on the early Abort() path.
 */
function HandleMigrateUTF8($pagename) {
  # $LastModFile must be imported here, otherwise the touch after a completed
  # migration silently never happens.
  global $Now, $EnableReadOnly, $Charset, $SiteAdminGroup, $MigrateUTF8, 
    $WorkDir, $UploadDir, $PageIndexFile, $MessagesFmt, $EnableMigratePartial, $EnableMigrateHasUploads,
    $PageStartFmt, $PageEndFmt, $HTMLStylesFmt, $EnableMigrateDone, $LastModFile;
  $pn = "$SiteAdminGroup.$SiteAdminGroup";
  $page = RetrieveAuthPage($pn, 'admin', true, READPAGE_CURRENT);
  if(!$page) return Abort('?No permissions.');
  
  # Configuration prerequisites that are still missing, as wiki markup.
  $todo = "";
  
  if(!$EnableReadOnly) $todo .= "@@&#36;EnableReadOnly = 1;@@\\\\\n";
  if($Charset != 'UTF-8') $todo .= "@@include_once(\"&#36;FarmD/scripts/xlpage-utf-8.php\");@@\n";
  

  if($todo) {
    $out = <<<EOF
You need to set in config.php:

>>hlt php indent<<
$todo
>><<

Then [[$pagename?action=migr8|refresh this page]].

EOF;
  
    return MigratePrintFmt($pagename, $out);  
  }
  
  # The ",old" directories are created by the final step below, so their
  # presence means a migration has already completed.
  if(is_dir("$WorkDir,old") || is_dir("$UploadDir,old")) {
    $out = <<<EOF
It seems the wiki may already have been migrated.

If this isn't the case, please remove the directories:

  "$WorkDir,old" and "$UploadDir,old"

Then [[$pagename?action=migr8|refresh this page]].

EOF;
  
    return MigratePrintFmt($pagename, $out);
  }
  
  $oldcharset = IsEnabled($_REQUEST['oldcharset'], $MigrateUTF8['OldCharset']);
  
  $suffix = $MigrateUTF8['TempSuffux'];
  $dofixperms = @$_POST['fixperms'];
  
  pm_session_start();
  
  # $files: old path => new path; $dirs: target directories still to be
  # created; $words: old word => recoded word (for the preview table).
  list($files, $dirs, $words) = prepareboth();
  
  # Remember in the session whether any upload has 8-bit characters in its
  # filename: once those files are moved they no longer show up in $files,
  # but the final report still needs to know.
  $q = preg_quote($UploadDir, '/');
  $oldupl = preg_grep( "/^$q\\//", array_keys($files));
  foreach($oldupl as $fpath) {
    $fname = preg_replace('!^.*/!', '', $fpath);
    if(preg_match("/[\\x80-\\xff]/", $fname)) {
      $_SESSION['HasIntlUploads'] = 1;
      break;
    }
  }
  if(@$_SESSION['HasIntlUploads']) $EnableMigrateHasUploads = 1;
  
  
  $done = "";
  if(@$_POST['recode']) {
    $EnableReadOnly = 0;
    $bct = $MigrateUTF8['BatchCheckTime'];
    
    $endtime = $Now + $MigrateUTF8['MaxSeconds'];
    $i = 0;
    
    # Set when the time budget ran out while directories were still missing.
    $dirstimedout = false;

    if($files)
      while($dirs) {
        if(++$i%$bct == 0 && time()>=$endtime) {
          $EnableMigratePartial = 1;
          $dirstimedout = true;
          break;
        }
        $d = array_shift($dirs);
        $ed = mig_hlt_new($d);
        mkdirp($d);
        $done .= "* Created directory \"$ed\"\n";
        
        if(@$MigrateUTF8['TestPause']) sleep($MigrateUTF8['TestPause']);
      }
    
    
    # The page index keeps its path relative to $WorkDir inside the
    # temporary work directory.
    $wdlen = strlen($WorkDir);    
    $newindex = "$WorkDir$suffix" . substr($PageIndexFile, $wdlen);
    
    if(time()<$endtime && !file_exists($newindex) && file_exists($PageIndexFile)) {
      # The temporary directory may not exist (yet); without this check a
      # failed open would pass false to fputs()/fclose().
      $ofp = @fopen($newindex, 'w');
      if (!$ofp) {
        $done .= "* %item red% Could not create $newindex\n";
      }
      else {
        $ifp = @fopen($PageIndexFile, 'r');
        if ($ifp) {
          while (!feof($ifp)) {
            $line = fgets($ifp, 4096);
            # Reassemble lines longer than the read buffer before recoding,
            # so a multi-byte sequence is never split.
            while (substr($line, -1, 1) != "\n" && !feof($ifp)) 
              $line .= fgets($ifp, 4096);
            $line = pm_recode($line, $oldcharset, 'UTF-8');
            fputs($ofp, $line);
          }
          fclose($ifp);
        }
        fclose($ofp);
        fixperms($newindex);
        $done .= "* Recoded $PageIndexFile=>$newindex\n";
      }
    }
    
    
    # Moving files only makes sense once all target directories exist; after
    # a timeout during directory creation the remaining work is left for the
    # next form submission.
    if(!$dirstimedout) foreach($files as $source=>$dest) {
      if(++$i%$bct == 0 && time()>=$endtime) {
        $EnableMigratePartial = 1;
        break;
      }
      $esource = mig_hlt_old($source);
      $edest = mig_hlt_new($dest);
      # .htaccess files are copied, not moved, so the old tree keeps them.
      if(substr($source, -9) == '.htaccess') {
        if(copy($source, $dest)) {
          fixperms($dest);
          $done .= "* Copied \"$esource\"=>\"$edest\"\n";
          unset($files[$source]);
        }
        else {
          $done .= "* %item red% Could not copy \"$esource\"=>\"$edest\"\n";
        }        
      }
      elseif(rename($source, $dest)) {
        
        $done .= "* Moved \"$esource\"=>\"$edest\"\n";
        
        if($dofixperms) fixperms($dest);
        
        unset($files[$source]);
      }
      else {
        $done .= "* %item red% Could not move \"$esource\"=>\"$edest\"\n";
      }
      
      if(@$MigrateUTF8['TestPause']) sleep($MigrateUTF8['TestPause']);
      
    }
    
    # Nothing left to move: swap the directories into place.
    if(count($files) == 0) {
      $words = $dirs = array();
      
      rename($WorkDir, "$WorkDir,old");
      rename($UploadDir, "$UploadDir,old");
      rename("$WorkDir$suffix", $WorkDir);
      rename("$UploadDir$suffix", $UploadDir);
      
      $done .= "* Old '$WorkDir' moved to %red%'$WorkDir,old'%%\n";
      $done .= "* Old '$UploadDir' moved to %red%'$UploadDir,old'%%\n";
      $done .= "* New '$WorkDir$suffix' moved to %green%'$WorkDir'%%\n";
      $done .= "* New '$UploadDir$suffix' moved to %green%'$UploadDir'%%\n";
      
      # Refresh the last-modified marker; recreate it when it cannot be
      # touched in place.
      if ($LastModFile && !@touch($LastModFile)) 
        { @unlink($LastModFile); touch($LastModFile); fixperms($LastModFile); }
      
      $EnableMigrateDone = 1;
    }
    
  }
  
  $tableheaders = "|| class=\"simpletable sortable filterable\"\n||!Old ||!New ||\n";
  $tablerenamed = $tablemoved = $tabledirs = $tablewords = '';
  # Counters for the summaries below; they must start at zero.
  $cntrenamed = $cntmoved = 0;
  
  foreach($words as $old => $new) {
    $markedold = mig_hlt_old($old);
    $markednew = mig_hlt_new($new);
    $tablewords.= "||@@$markedold@@ ||@@$markednew@@ ||\n";
  }
  if($tablewords) $tablewords = "$tableheaders$tablewords";
  else $tablewords = "Nothing to transpose.";
  
  $newdirs = "";
  foreach($dirs as $d) {
    $d = mig_hlt_new($d);
    $newdirs .= "# $d\n";
  }
  
  if($newdirs) {
    $cntdirs = count($dirs);
    $newdirs = "(:details summary='$cntdirs new directories to be created':)\n$newdirs\n(:detailsend:)\n";
  }
  
  
  # Files whose path has 8-bit bytes get a recoded name; all others are
  # moved unchanged.
  foreach($files as  $old => $new) {
    if(preg_match('/[\\x80-\\xff]+/', $old)) {
      $markedold = mig_hlt_old($old);
      $markednew = mig_hlt_new($new);
      $tablerenamed .= "||@@$markedold@@ ||@@$markednew@@ ||\n";
      $cntrenamed++;
    }
    else {
      $tablemoved .= "||@@$old@@ ||@@$new@@ ||\n";
      $cntmoved ++;
    }
  }
  
  if($tablerenamed) {
    $tablerenamed = "(:details summary='$cntrenamed files with international filenames to be renamed (recoded)':)\n$tableheaders$tablerenamed\n(:detailsend:)\n";
  }
  
  if($tablemoved) {
    $tablemoved = "(:details summary='$cntmoved files with ASCII filenames to be moved (no recode)':)\n$tableheaders$tablemoved\n(:detailsend:)\n";
  }
  
  
  
  $out = <<<EOF
(:messages:)
! Migrating wiki {\$WikiTitle} to UTF-8

(:if enabled EnableMigrateDone:)
(:div2 class=frame:)
$done

!!! The recode operation completed

Please [[$pagename|browse the wiki]] and check if everything is correct.

Remember to remove or comment out the line @@&#36;EnableReadOnly = 1;@@ from config.php.

You may also need to save your config.php and other local files in the UTF-8 encoding - check the Save-As or Tools options of your text editor.

(:if2 enabled EnableMigrateHasUploads:)
You seem to have uploads with international characters in the filenames, you may want to add to config.php:

>>hlt php<<
  &#36;UploadNameChars = "-\\\\p{L}\\\\p{N}_. ";
  &#36;MakeUploadNamePatterns = array(
    "/[^&#36;UploadNameChars]/u" => '',
    '/(\\\\.[^.]*)$/' => 'cb_tolower',
    '/^[^_\\\\p{L}\\\\p{N}]+/u' => '',
    '/[^_\\\\p{L}\\\\p{N}]+$/u' => ''
  );
>><<

See also PmWiki:UTF-8.

(:if2end:)
(:div2end:)

(:else:)
(:if2 enabled EnableMigratePartial:)
(:div2 class=frame:)
!!! Partial migration

$done

'''Migration paused at configured time limit. Please review and resubmit the form below to continue.'''
(:div2end:)

(:if2:)
This function will migrate a wiki content from an old 8-bit encoding to UTF-8.

The semi-automated function will perform these steps:
# Temporary directories will be created for wiki page files ($WorkDir%green%$suffix%%) and attachments ($UploadDir%green%$suffix%%).
# $PageIndexFile will be recoded to UTF-8.
# All page files and attachments will be moved to the temporary directories, recoding their filenames where necessary. PmWiki cache files are skipped.
** If the processing takes more than {$MigrateUTF8['MaxSeconds']} seconds, it will stop, and you will need to repost the form below to continue from where it stopped.
# The old directories will be renamed to "$WorkDir%red%,old%%" and "$UploadDir%red%,old%%" and the temporary directories will be moved to their names.

Note, this only recodes the directory names, and filenames of page files and attachments. Normally PmWiki should recode page content automatically (text, metadata, and history).

!! Preview recoding of filenames

Please preview unique words to be replaced:

$tablewords

$newdirs

$tablerenamed

$tablemoved

Please preview the above replacements. If you see any errors, you should try different source encodings in the form below, and press "Preview", until you find the correct one.

(:input form action="{\$PageUrl}?action=migr8" method=post class=frame:)
(:input hidden n $pagename:)(:input hidden action migr8:)
(:input default request=1:)
!! Control panel
(:notoc:)
!!!! Select source encoding:
(:migrate-charsets:) &nbsp; (:input submit preview Preview formnovalidate=formnovalidate:) \\\\
If some or all of the above recoded samples are wrong, please try different encodings.

(:input checkbox fixperms 1 "Fix file permissions after rename/move (slower, only enable if evidence for inaccessible pages or files)":)

!!!! Please confirm these statements to continue:

(:input checkbox have_backups 1 "I have a recent backup snapshot in case something breaks and the wiki needs to be reinstalled" required=required:)

(:input checkbox read_docs 1 "I have read the documentation at" required=required:) Cookbook:MigrateUTF8

(:input checkbox seen_preview 1 "I have reviewed the recoded strings above and they are correct" required=required:)

(:input submit recode "Recode (cannot be undone)" data-pmconfirm="This cannot be undone, except from your backups. Proceed with recoding?":)
(:input end:)

(:ifend:)
EOF;

  

  MigratePrintFmt($pagename, $out);
}

/**
 * Mark the 8-bit parts of an old (not yet recoded) name for display.
 *
 * Every run of bytes in the range 0x80-0xff is replaced by the output of
 * cb_markintl(), which shows the bytes as hex escapes.
 *
 * @param string $str Old file name, path or word.
 * @return string The string with its high-byte runs replaced.
 */
function mig_hlt_old($str) {
  $highbytes = '/[\\x80-\\xff]+/';
  $marked = preg_replace_callback($highbytes, "cb_markintl", $str);
  return $marked;
}

/**
 * Mark the non-ASCII parts of a recoded name for display.
 *
 * Every run of bytes in the range 0x80-0xff is kept as is and wrapped in
 * "%green bgcolor=yellow%...%%" wiki style markup.
 *
 * @param string $new Recoded file name, path or word.
 * @return string The string with its high-byte runs wrapped.
 */
function mig_hlt_new($new) {
  # $0 is the regex back-reference to the whole match, not a PHP variable.
  $wrapped = '%green bgcolor=yellow%$0%%';
  return preg_replace('/[\\x80-\\xff]+/', $wrapped, $new);
}

/**
 * Print the migration page and end the request.
 *
 * Wraps $out in the fixed heading and footer text, disables the listed skin
 * parts, closes the session for writing, sends the page between
 * $PageStartFmt and $PageEndFmt, then exits.
 *
 * @param string $pagename Current page name, passed on to PrintFmt().
 * @param string $out      Wiki markup for the body of the page.
 * @return void Never returns; the script exits after printing.
 */
function MigratePrintFmt($pagename, $out) {
  global $PageStartFmt, $PageEndFmt;

  DisableSkinParts('Header Footer Left Right Title');

  # The "markup:" prefix precedes the wiki text handed to PrintFmt().
  $body = <<<EOF
markup:(:messages:)
! Migrating {\$WikiTitle} to UTF-8

Please review the documentation at Cookbook:MigrateUTF8 before using this tool.
----

$out

----
Please contact me at Cookbook:MigrateUTF8-Talk if you have any questions, difficulties, or suggestions for improvements.


EOF;

  $parts = array();
  $parts[] = $PageStartFmt;
  $parts[] = "<div class='migrateutf8'>";
  $parts[] = $body;
  $parts[] = "</div>";
  $parts[] = $PageEndFmt;

  session_write_close();

  PrintFmt($pagename, $parts);
  exit;
}

/**
 * Regex callback: show a run of bytes as highlighted hex escapes.
 *
 * Each byte of the match becomes "\x" followed by its hexadecimal value
 * (no zero padding), and the result is wrapped in
 * "%red bgcolor=yellow%...%%" wiki style markup.
 *
 * @param array $m Match array; only element 0 (the whole match) is read.
 * @return string The marked-up hex representation.
 */
function cb_markintl($m) {
  $run = $m[0];
  $length = strlen($run);
  $escaped = '';
  $pos = 0;
  while ($pos < $length) {
    $escaped .= sprintf('\\x%x', ord($run[$pos]));
    $pos++;
  }
  return '%red bgcolor=yellow%' . $escaped . '%%';
}

/**
 * Collect the pending migration work for both $WorkDir and $UploadDir.
 *
 * The source charset is taken from the 'oldcharset' request value, falling
 * back to $MigrateUTF8['OldCharset'].
 *
 * @return array Three elements:
 *   0: old path => new path for every file still to be moved (key-sorted),
 *   1: list of target directories that do not exist yet (sorted),
 *   2: old word => recoded word (key-sorted).
 */
function prepareboth() {
  global $WorkDir, $UploadDir, $MigrateUTF8;
  $oldcharset = IsEnabled($_REQUEST['oldcharset'], $MigrateUTF8['OldCharset']);

  $pages = preparedir($WorkDir, $oldcharset);
  $uploads = preparedir($UploadDir, $oldcharset);

  $files = array_merge($pages[0], $uploads[0]);
  $dirs = array_merge($pages[1], $uploads[1]);
  $words = array_merge($pages[2], $uploads[2]);
  ksort($words);
  ksort($files);
  ksort($dirs);

  # Only directories that are not there yet need to be created.
  $missing = array_values(array_filter(
    array_keys($dirs),
    function($dir) { return !is_dir($dir); }
  ));

  return array($files, $missing, $words);
}

/**
 * Collect the pending migration work below one base directory.
 *
 * @param string $base       Directory to scan.
 * @param string $oldcharset Source encoding of the existing file names.
 * @return array Three elements: old path => new path for each file found by
 *   getdir_recursive(); the set of target directories (as array keys) that
 *   those new paths live in; old word => recoded word from intlwords().
 *   All three are empty when $base is not a directory.
 */
function preparedir($base, $oldcharset) {
  if (!is_dir($base)) {
    return array(array(), array(), array());
  }

  $map = getdir_recursive($base, $oldcharset);
  $words = intlwords(array_keys($map), $oldcharset);

  # Parent directory of every new path: strip the last path component.
  $parents = array();
  foreach ($map as $target) {
    $parent = preg_replace('!/[^/]+$!', '', $target);
    $parents[$parent] = 1;
  }

  return array($map, $parents, $words);
}


/**
 * Build the table of words that will change when recoded.
 *
 * Each input string is split on "-", "/", ".", "," and space. Every piece
 * that contains a character other than ASCII letters, digits or "-" is
 * recorded together with its recoded form.
 *
 * @param array  $a          Strings to scan (the callers pass file paths).
 * @param string $oldcharset Source encoding handed to pm_recode().
 * @return array Original word => word recoded to UTF-8.
 */
function intlwords($a, $oldcharset) {
  $found = array();
  foreach ($a as $name) {
    $tokens = preg_split('![-/., ]!', $name);
    foreach ($tokens as $token) {
      # Already recoded earlier.
      if (!empty($found[$token])) continue;
      # Plain ASCII alphanumeric pieces are of no interest.
      if (!preg_match('/[^-a-zA-Z0-9]/', $token)) continue;
      $found[$token] = pm_recode($token, $oldcharset, 'UTF-8');
    }
  }
  return $found;
}

/**
 * List the files below $dir that still have to be moved.
 *
 * Entries ending in ",cache" and the page index file are skipped, and
 * subdirectories are scanned recursively. A file is left out when its
 * target path already exists, so a resubmitted form only sees the
 * remaining work.
 *
 * The target path lives in the temporary directory, i.e. the $WorkDir or
 * $UploadDir part of the path followed by $MigrateUTF8['TempSuffux'], which
 * is the directory HandleMigrateUTF8() later renames into place.
 *
 * @param string $dir        Directory to scan ($WorkDir, $UploadDir or a
 *                           directory below them).
 * @param string $oldcharset Source encoding of the existing file names.
 * @return array Old path => new path recoded to UTF-8.
 */
function getdir_recursive($dir, $oldcharset) {
  global $PageIndexFile, $MigrateUTF8, $WorkDir, $UploadDir;
  $suffix = $MigrateUTF8['TempSuffux'];

  # Append the suffix to the base directory itself, so that nested or
  # absolute base paths produce the same "$WorkDir$suffix" and
  # "$UploadDir$suffix" locations the handler expects.
  $newdir = null;
  foreach (array($WorkDir, $UploadDir) as $base) {
    if (!is_string($base) || $base === '') continue;
    $baselen = strlen($base);
    if ($dir === $base || strncmp($dir, "$base/", $baselen + 1) === 0) {
      $newdir = $base . $suffix . substr($dir, $baselen);
      break;
    }
  }
  # Outside both bases: fall back to suffixing the first path component.
  if ($newdir === null)
    $newdir = preg_replace('!^[^/]+!', '$0' . $suffix, $dir);
  
  $out = [];
  $dirp = @opendir($dir);
  # An unreadable directory contributes nothing; passing false on to
  # readdir()/closedir() would raise a TypeError on PHP 8.
  if (!$dirp) return $out;
  while (($file=readdir($dirp)) !== false) {
    if ($file=='.' || $file=='..') continue;
    if (substr($file, -6) == ',cache') continue;
    $path = "$dir/$file";
    # The page index is recoded separately by the handler.
    if($PageIndexFile == $path) continue;
    if (is_dir($path)) 
      $out = array_merge($out, getdir_recursive($path, $oldcharset));
    else {
      $newpath = pm_recode("$newdir/$file", $oldcharset, 'UTF-8');
      if(!file_exists($newpath))
        $out[$path] = $newpath;
    }
  }
  closedir($dirp);
  return $out;
}

# PageStore8bit can recode an 8-bit PageStore directory on the fly.
# It is not what MigrateUTF8 relies on: installing it needs more involved
# config.php changes than the one-time migration above, and it does nothing
# for uploads that have international characters in their paths and
# filenames.
class PageStore8bit extends PageStore {
  # pagefile_encode: recode a name from UTF-8 to the $Charset8bit encoding.
  function PFE($f) {
    global $Charset8bit;
    $encoded = pm_recode($f, 'UTF-8', $Charset8bit);
    return $encoded;
  }

  # pagefile_decode: recode a name from the $Charset8bit encoding to UTF-8.
  function PFD($f) {
    global $Charset8bit;
    $decoded = pm_recode($f, $Charset8bit, 'UTF-8');
    return $decoded;
  }

  # Writing is refused (not expected to be called): always aborts.
  function write($pagename,$page) {
    return Abort('?Read only page storage');
  }
}