<?php

/*
rTxt2htm 1.2.1, 22 January 2019
Copyright Santosh Patnaik
GPL v3 license
A PHP Labware internal utility - www.bioinformatics.org/phplabware/internal_utilities

See rTxt2htm_README.txt or rTxt2htm_README.htm for more

rTxt2htm creates HTML files from text files with special but simple and unobtrusive markup. It is intended for generating HTML versions of plain-text documentation (like 'readme' files). 

Formatters (processing done in shown order) are:

- block of text with '+-----(5 or more)+' at top and at bottom is rendered as plain, mono-spaced text for tables, ASCII diagrams, etc.; rest of formatters don't apply to its content
- block of text with '== Content ==(any number of)' at top and >1 empty lines at bottom is considered a table of contents (TOC); rest of formatters except `, * and ' don't apply to its content
- block of text flanked with a line with '/*'-style PHP comment markers at top and at bottom = a 'subtle' div
- lines like '@@title:...' are used to identify title, language, encoding, keywords and description, and removed
- 4 spaces before sentence = sentence shown as code (a tab is 4 spaces)
- 'term' = term given different appearance using 'span class="term"'; `, * and : are neutralized
- `term` = italics
- *term* = bold
- one_word1:- 1 space one_word2 = one_word1 hyperlinked to one_word2; use #one_word2 for anchors
- term with http:, https:, mailto:, ftp:, sftp:, file: = link created
- empty line and == (optional number) (text) ==(any no. of times)(optional o's) on next line = section div start; text in h2; no. of o's dictate number of previous opened divs to be closed; number like '1' and '3.2.1'; the headings get an anchor named same as the number but prefixed with 's', like 's1' and 's3.2.1'
- empty line and -- (optional number) (text) --(any no. of times)(optional o's) on next line = like above but for sub-section and h3
- empty line and .. (optional number) (text) ..(any no. of times)(optional o's) on next line = like above but for sub-sub-section and h4
- empty line and _____ (5 or more underscores) on next line by themselves = <hr>; optional o's at end for div closures

To try, use following:

== Content ===========================

1 Check section
  1.1 `Check` *sub-section*
    1.1.1 Check sub-sub-section
  1.2 Check section named 'bløf Charlène'
2  Check another section
  (2A) Alphanumeric identifier


== 1  Check section ==================

Check 'this' & '<this>'

+--------------------------+
Simple table
============

ID  Name     Age
..  ....     ...

1   John     23
2   Ram       8
+--------------------------+

-- 1.1 `Check` *sub-section* -------------

Check `this` and *this* and `*this*`!

.. 1.1.1 Check sub-sub-section .......

Check these:

*  for URLs, see section:- #2
*  here is CNN:- http://cnn.com (here:- http://abc.com is ABC)
*  send me mail:- mailto:hello@me.com

-- 1.2 Check section named 'bløf Charlène' ------------------------oo

    Check code with 'this' `this` *this* 
    
Check a horizontal rule

______
    
    
== 2  Check another section =========oo

*  http://www.cnn.com
*  mailto:someone@somewhere.com
*  file://a_file

Check some non-English characters:

*  'Bløf Charlène', *bløf Charlène*, `bløf Charlène`, `bløf Charlène`
*  Charlène:- where? Charlène:- where?
*  *¥ · £ · € · $ · ¢ · ₡*


-- (2A) Alphanumeric identifier -------

_____________________________________oo

@@title: example text

*/

// time-limit
set_time_limit(360);

// errors
// E_ALL already includes E_STRICT from PHP 5.4 onwards, and the E_STRICT constant itself is deprecated as of PHP 8.4; so the constant is referenced only on older versions, where it has to be added explicitly
error_reporting(version_compare(PHP_VERSION, '5.4.0', '<') ? (E_ALL | E_STRICT) : E_ALL);
ini_set('display_errors', 1); // 1 to debug

// defaults; values submitted with the form, and '@@...' lines in the text, can replace most of these further below
$title = 'rTxt2htm : convert text to HTML';
$text_file = '';
$maxsize = 500000;
$meta_kword = 'rTxt2htm, text, HTM/HTML, converter, conversion, convert, PHP, Labware, rst, reStructured';
$meta_desc = 'rTxt2htm from PHP Labware converts plain text to HTML';
$lang = 'en'; // IANA-recognized language
$enc = 'utf-8'; // best if same as plain-text file's encoding; IANA-recognized charset encoding
$action = $direct = 0;
// default stylesheet; it is written into the 'style' element of the generated page and offered for editing in the form, and a non-empty 'css' form value replaces it
$css = "
a {text-decoration:none; color: blue;}
a:hover {color: red;}
a:visited {color: blue;}
body {margin: 0; padding: 0;}
body, div, html, p {font-family: Georgia, 'Times new roman', Times;}
code.code {font-family: 'Bitstream vera sans mono', 'Courier New', 'Courier', monospace;}
div.comment {padding: 5px; color: #999999; font-size: 80%;}
div.comment a {color: #6699cc;}
div#body {width: 70%; margin: 5px; padding: 5px;} /* holds non-toc content */
div#toc {position: fixed; top: 5px; left: 73%; z-index: 2; margin-top: 5px; margin-left: 5px; border: 1px solid gray; padding: 5px; background-color: #ededed; width: 23%; overflow: auto; max-height:94%; font-size: 90%;} /* holds content table (toc) */
div#top {font-size: 14px; margin: 5px; padding: 5px;} /* holds all content */
div.monospace {overflow: auto; font-family: 'Bitstream vera sans mono', 'Courier New', 'Courier', monospace;}
div.sub-section {padding-left: 15px;}
div.sub-sub-section {padding-left: 30px;}
h1 {font-size: 22px; margin-top: 5px; margin-bottom: 5px;}
h2 {font-size: 20px; float: left; margin-top: 15px; margin-bottom: 5px;}
h3 {font-size: 18px; float: left; margin-top: 15px; margin-bottom: 5px;}
h4 {font-size: 16px; float: left; margin-top: 15px; margin-bottom: 5px;}
hr {margin-top: 15px; margin-bottom: 5px;}
input, textarea {font-family: 'Bitstream vera sans mono', 'Courier New', 'Courier', monospace;}
p.subtle {color: gray; padding: 0; padding-top: 10px; margin: 0;}
p.subtle a, p.subtle a:visited {color: #6699cc;}
span.item-no {color: black;}
span.subtle {color: gray; margin: 0; padding:0;}
span.subtle a, span.subtle a:visited {color: #6699cc;}
span.term {font-family: 'Bitstream vera sans mono', 'Courier New', 'Courier', monospace;}
span.toc-item {color: black;}
span.totop {float: right; margin-top: 15px; margin-bottom: 5px;}
span.totop a, span.totop a:visited {color: #6699cc;}
@media screen { /* fixes for old IE */
 * html, * html body {overflow-y: auto!important; height: 100%; margin: 0; padding: 0;}
 * html div#body {height: 100%; overflow-y: auto; position: relative;}
 * html div#toc {position: absolute;}
}
";

// form values
// runs only for a submission of the conversion form (field 'action' = 1); takes over the submitted settings and obtains the text to convert as $t, from the uploaded file or else from the pasted text; with no usable text, $action goes back to 0 so that the form is shown again
if(isset($_POST['action']) && $_POST['action'] == 1){
 $action = 1;

 // magic quotes exist only before PHP 5.4, and get_magic_quotes_gpc() is deprecated in PHP 7.4 and removed in PHP 8.0 (calling it there is a fatal error); so it is called only where it can matter
 if(version_compare(PHP_VERSION, '5.4.0', '<') && get_magic_quotes_gpc()){
  foreach($_POST as $k => $v){
   // array-valued fields are not used by this script and cannot be given to stripslashes()
   if(is_string($v)){
    $_POST[$k] = stripslashes($v);
   }
  }
  ini_set('magic_quotes_gpc', 0);
 }
 if(version_compare(PHP_VERSION, '5.3.0', '<')){
  set_magic_quotes_runtime(0);
 }

 // a setting is taken over only if it is a non-empty string; a field sent as an array (like 'css[]=...') is ignored because everything downstream expects strings
 foreach(array('css', 'enc', 'lang', 'meta_desc', 'meta_kword', 'title') as $k){
  if(isset($_POST[$k]) && is_string($_POST[$k]) && strlen($_POST[$k])){
   $$k = $_POST[$k];
  }
 }
 $direct = isset($_POST['direct']) ? (bool)$_POST['direct'] : $direct;

 // an uploaded file takes precedence over pasted text; every $_FILES element is checked before use since the request need not come from this script's own form
 $upload = (isset($_FILES['text_file']) && is_array($_FILES['text_file'])) ? $_FILES['text_file'] : array();
 $size = (isset($upload['size']) && is_scalar($upload['size'])) ? (int)$upload['size'] : 0;
 if($size > 0 && $size < $maxsize && isset($upload['tmp_name']) && is_string($upload['tmp_name']) && is_uploaded_file($upload['tmp_name'])){
  $data = file_get_contents($upload['tmp_name']);
  // a failed read gives false, which must not reach the converter as if it were text
  if($data !== false){
   $name = (isset($upload['name']) && is_string($upload['name'])) ? str_replace(array("\0", '\\', ':'), '', $upload['name']) : '';
   $text_file = strlen($name) ? $name : 'readme';
   $t = $data;
  }
 }

 // no usable upload: fall back to the pasted text, cut to the size limit
 if(!isset($t)){
  if(isset($_POST['text_in']) && is_string($_POST['text_in']) && strlen($_POST['text_in'])){
   $t = substr($_POST['text_in'], 0, $maxsize);
  }else{
   $action = 0;
  }
 }
}

// conversion, first part; done before any output because the '@@...' lines found here set values that go into the page head
if(isset($t)){
 // speed is not of concern
 
 // hide special chars <, > and &; make tab = 4 spaces; standardize line-breaks; to insert real line-breaks later, we will use \r (since \n will be converted to HTML <br>)
 // the control characters \x01, \x02 and \x03 stand in for <, > and & until the end of the conversion, when they are turned into entities; this keeps them apart from the markup that the formatters add
 $t = str_replace(array('<', '>', '&', "\t", "\r\n", "\r"), array("\x01", "\x02", "\x03", '    ', "\n", "\n"), $t);

 // remove spaces before line-breaks; usually unintended; if present, hamper regex matching
 $t = preg_replace('` *\n`m', "\n", $t);

 // unslash() - meant for the strange behavior with the e modifier in preg_replace: if a probe replacement shows that text handed to a callback comes back altered, \" in the given text is turned into "; otherwise the flag below is cleared and every later call hands its argument back untouched
 $unslash = 1;
 function unslash($w){
  global $unslash;
  // flag still set: no probe has yet shown that unslashing is unnecessary
  if($unslash){
   $sample = 'He wrote "This".';
   $probed = preg_replace_callback('`(^He )(wrote)( "This".)$`', function($m){return $m[1]. $m[2]. $m[3];}, $sample);
   if($sample == $probed){
    // text passes through callbacks unchanged; skip the probe from now on
    $unslash = 0;
   }else{
    return str_replace('\"', '"', $w);
   }
  }
  return $w;
 }
 
 // non-formatted or pre
 // a block fenced by '+-----+' lines (5 or more dashes) becomes a 'monospace' div; the formatter characters in it are replaced with numeric entities so that the formatters that run later cannot match inside it, and its spaces become non-breaking ones; the \r characters around the div become real line-breaks at the end of the conversion
 $t = preg_replace_callback('`(?:^|\n) *\+-{5,}\+\n(.*?)\n *\+-{5,}\+\n(?=\n|\r|$)`sm', function($p){return "\r\n". '<div class="monospace">'. str_replace(array("*", "'", "`", "-", ":", ".", "=", "_", " "), array("&#42;", "&#39;", "&#96;", "&#45;", "&#58;", "&#46;", "&#61;", "&#95;", "&#160;"), unslash($p[1])). '</div>'. "\r";}, $t);
 
 // extract TOC into separate variable; thus other formatters don't apply anymore to TOC content; hyperlink TOC items; items have numeric identifiers or alphanumeric ones inside round brackets pointing to relevant sections
 $toc = '';
 // the TOC content is taken from the first match only; NOTE(review): the removal that follows has no limit, so any further block of the same form would be removed from the text as well
 if(preg_match('`(?:^|\n)== *Content *==*\n*(.*?)\n\n`sm', $t, $m)){
  $t = preg_replace('`(?:^|\n)== *Content *==*\n* *(.*?)\n\n`sm', "\n", $t);
  // 'term', `italics` and *bold*, with the same patterns that are used for the rest of the text later
  $toc = preg_replace_callback('`(^|\s[\([]?)\'([^\s][^\'\n\r]*)\'(?=[:;!?,.)\]]*\s|$)`', function($p){return $p[1]. '<span class="term">'. str_replace(array("*", "`", ":"), array("&#42;", "&#96;", "&#58;"), unslash($p[2])). '</span>';}, $m[1]);
  $toc = preg_replace('/(^|\s[\([]?)`([^\s][^`\n\r]*)`(?=[:;!?,.)\]]*\s|$)/', "$1<em>$2</em>", $toc);
  $toc = preg_replace('`(^|\s[\([]?|<em>)\*([^\s][^*\n\r]*)\*(?=[:;!?,.)\]]*\s|</em>|$)`', "$1<strong>$2</strong>", $toc);
  // every TOC line goes into a 'toc-item' span; a line starting with an identifier is linked to the anchor '#s' + identifier, with round brackets of the identifier replaced by underscores in the anchor and dropped from the shown text
  $toc = "\r". '<div id="toc">'. preg_replace_callback('`^( *)(\([a-zA-Z\d.]+\)|[\d.]+)?(.*)`m', function($p){return $p[1]. '<span class="toc-item">'. (strlen($p[2]) ? '<a href="#s'. str_replace(array("(", ")"), "_", $p[2]). '">' : ''). '<span class="item-no">'. str_replace(array("(", ")"), "", $p[2]). '</span>'.  unslash($p[3]). (strlen($p[2]) ? '</a>' : ''). '</span>';}, $toc). "</div><!-- ended div toc -->\r";
 }

 // /*-style PHP comment blocks put into a subtle div; replacing @, `, ', - and * with entities as they are formatter characters and not needed here, and we are allowing regular links but not those with ':- '; we add an extra space before the '</div>' in case the last word in the block is a regular link (regular link regex will fail with '<' of '</div>')
 $t = preg_replace_callback('`(?:^|\n) */\*\n(.+?)\n *\*/\n(?=\n|\r|$)`ms', function($p){return "\r\n". '<div class="comment">' . str_replace(array("*", "'", "`", "-", "@"), array("&#42;", "&#39;", "&#96;", "&#45;", "&#64;"), unslash($p[1])). ' </div>'. "\r";}, $t);
 
 // auto-identify title, etc. 
 // each pattern needs a line-break right before the '@@', so such a line is not recognized on the very first line of the text; the first occurrence gives the value, replacing the default or form value, and all occurrences are removed from the text
 if(preg_match('`\n@@encoding:(.*)`m', $t, $m)){
  $enc = trim($m[1]);
  $t = preg_replace('`\n@@encoding:(.*)`m', '', $t);
 }
 if(preg_match('`\n@@language:(.*)`m', $t, $m)){
  $lang = trim($m[1]);
  $t = preg_replace('`\n@@language:(.*)`m', '', $t); 
 }
 if(preg_match('`\n@@description:(.*)`m', $t, $m)){
  $meta_desc = trim($m[1]);
  $t = preg_replace('`\n@@description:(.*)`m', '', $t);
 }
 if(preg_match('`\n@@keywords:(.*)`m', $t, $m)){
  $meta_kword = trim($m[1]);
  $t = preg_replace('`\n@@keywords:(.*)`m', '', $t);  
 }
 if(preg_match('`\n@@title:(.*)`m', $t, $m)){
  $title = trim($m[1]);
  $t = preg_replace('`\n@@title:(.*)`m', '', $t);
 }  
}

// when a direct download is requested, the whole page is buffered so that it can be sent as an attachment at the end of the script
if($direct){
 ob_start();
}
// $lang can come from the form or from a '@@language:' line of the text; so, like every other value put into the markup below, it is escaped, and it is also used for the Content-Language declaration in place of a fixed 'en'
?><!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="<?php echo htmlspecialchars($lang); ?>" lang="<?php echo htmlspecialchars($lang); ?>">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=<?php echo htmlspecialchars($enc); ?>" />
<meta http-equiv="Content-Language" content="<?php echo htmlspecialchars($lang); ?>" />
<meta name="description" content="<?php echo htmlspecialchars($meta_desc. ' - '. $text_file); ?> - presented with rTxt2htm, a PHP Labware utility" />
<meta name="keywords" content="<?php echo htmlspecialchars($meta_kword. ', '. $text_file); ?>, rTxt2htm, PHP Labware" />
<style type="text/css" media="all">
<!--/*--><![CDATA[/*><!--*/
<?php
// quotes are left alone (ENT_NOQUOTES): when the page is parsed as HTML, entities are not decoded inside a 'style' element, and as of PHP 8.1 htmlspecialchars() converts single quotes too by default, which would corrupt values like 'Courier New'; <, > and & are still converted so that the CSS cannot end the element
echo htmlspecialchars($css, ENT_NOQUOTES);
?>
/*]]>*/-->
</style>
<title><?php echo htmlspecialchars($title. ' | '. $meta_desc); ?></title>
</head>
<body>
<div id="top">
<h1><a id="peak" name="peak"></a><?php echo htmlspecialchars($title); ?></h1>
<?php if($action){

// conversion, second part: the remaining formatters are applied to $t in the documented order, and the result is printed; $action is set only when the form handling obtained text, in which case the first part of the conversion has already run and defined unslash() and $toc
// NOTE(review): only <, > and & were hidden at the start, and double quotes are not escaped by any step here; a link target containing a double quote would therefore end its href attribute early - confirm that the input text is trusted

// line with 4 leading spaces put into a code div; replacing `, ', * and : with entities as they are formatter characters and not needed here
$t = preg_replace_callback('`(?<=^|\n|\r)    (?:[^\n]+?)(?=\n|\r|$)`m', function($p){return "\r". '<code class="code">' . str_replace(array("*", "'", "`", ":"), array("&#42;", "&#39;", "&#96;", "&#58;"), unslash($p[0])). '</code>'. "\r";}, $t);

// 'special' text, italics and bold; allowing for some punctuation marks
$t = preg_replace_callback('`(^|\s[\([]?)\'([^\s][^\'\n\r]*)\'(?=[:;!?,.)\]]*\s|$)`', function($p){return $p[1]. '<span class="term">'. str_replace(array("*", "`", ":"), array("&#42;", "&#96;", "&#58;"), unslash($p[2])). '</span>';}, $t);
$t = preg_replace('/(^|\s[\([]?)`([^\s][^`\n\r]*)`(?=[:;!?,.)\]]*\s|$)/', "$1<em>$2</em>", $t);
$t = preg_replace('`(^|\s[\([]?|<em>)\*([^\s][^*\n\r]*)\*(?=[:;!?,.)\]]*\s|</em>|$)`', "$1<strong>$2</strong>", $t);

// links of type 'word:- resource'
// a resource starting with '#' is taken as a section identifier: the link points to the anchor '#s' + identifier, and the identifier is added to the link text after a space
$t = preg_replace_callback('`(^|\s[\([]?)(\S+?):- ([#a-zA-Z_\-0-9./](?:\S*)[a-zA-Z_\-0-9/])(?=[:;!?,.)\]]*\s|$)`', function($p){return $p[1]. '<a href="'. (substr(unslash($p[3]), 0, 1) == '#' ? '#s'. substr(unslash($p[3]), 1) : unslash($p[3])). '">'. unslash($p[2]). (substr(unslash($p[3]), 0, 1) == '#' ? ' '. substr(unslash($p[3]), 1) : ''). '</a>';}, $t);

// regular links
$t = preg_replace('`(^|\s[\([]?)((?:http|https|mailto|ftp|sftp|file):(?:[#a-zA-Z_\-0-9./](?:\S+)[a-zA-Z_\-0-9/]))(?=[:;!?,.)\]]*\s|$)`', '$1<a href="$2">$2</a>', $t);

// sections, sub-sections and sub-sub-sections; create anchors named with section number with 's' prefix (XHTML spec: IDs start with a letter); add a 'to top' link; close any open divs as indicated by terminating o's
// in each pattern: group 1 = optional identifier (round brackets become underscores in the anchor name and are dropped from the shown number), group 2 = heading text, group 3 = the o's, one '</div>' per 'o'; an empty line right after the heading is swallowed only if another line-break follows it
$t = preg_replace_callback('`(?:\r|\n)\n==+ *(\([a-zA-Z\d.]+\)|[\d.]+)?(.*?) *=+(o*)\n(\n(?=\n))?`', function($p){return str_repeat("\r</div>", strlen(unslash($p[3]))). "\r". '<div class="section"><h2>'. "\r". (strlen(unslash($p[1])) ? '<a name="s'. str_replace(array("(", ")"), "_", $p[1]). '" id="s'. str_replace(array("(", ")"), "_", $p[1]). '"></a>' : ''). '<span class="item-no">'. str_replace(array("(", ")"), "", $p[1]). '</span>'. unslash($p[2]). "\r". '</h2><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" />'. "\r";}, $t);
$t = preg_replace_callback('`(?:\r|\n)\n--+ *(\([a-zA-Z\d.]+\)|[\d.]+)?(.*?) *-+(o*)\n(\n(?=\n))?`', function($p){return str_repeat("\r</div>", strlen(unslash($p[3]))). "\r". '<div class="sub-section"><h3>'. "\r". (strlen(unslash($p[1])) ? '<a name="s'. str_replace(array("(", ")"), "_", $p[1]). '" id="s'. str_replace(array("(", ")"), "_", $p[1]). '"></a>' : ''). '<span class="item-no">'. str_replace(array("(", ")"), "", $p[1]). '</span>'. unslash($p[2]). "\r". '</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" />'. "\r";}, $t);
$t = preg_replace_callback('`(?:\r|\n)\n\.\.+ *(\([a-zA-Z\d.]+\)|[\d.]+)?(.*?) *\.+(o*)\n(\n(?=\n))?`', function($p){return str_repeat("\r</div>", strlen(unslash($p[3]))). "\r". '<div class="sub-sub-section"><h4>'. "\r". (strlen(unslash($p[1])) ? '<a name="s'. str_replace(array("(", ")"), "_", $p[1]). '" id="s'. str_replace(array("(", ")"), "_", $p[1]). '"></a>' : ''). '<span class="item-no">'. str_replace(array("(", ")"), "", $p[1]). '</span>'. unslash($p[2]). "\r". '</h4><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" />'. "\r";}, $t);

// put non-TOC content into 'body' div; also care for <hr>
// the TOC div, if any, is placed before the 'body' div; the 'body' div is closed by the markup that follows this script block
$t = $toc. "\r". '<div id="body">'. preg_replace_callback('`(?:\r|\n)\n *__{4,}(o*)(?=\r|\n|$)`', function($p){return str_repeat("\r</div>", strlen(unslash($p[1]))). "\r\n". '<hr />';}, $t);

// preserving spaces
// a space at the start of a line or right after a tag, and the second of any two consecutive spaces, become non-breaking spaces
$t = str_replace('  ', ' &#160;', preg_replace('`(?<=^|\r|\n|>) `', '&#160;', $t));

// \n to <br>
$t = str_replace("\n", "<br />\n", rtrim($t));

// entitify <, > and & and bring them out (hidden so far as special characters)
$t = str_replace(array("\x01", "\x02", "\x03"), array('&lt;', '&gt;', '&amp;'), $t);

// intended real line-breaks; \r proxying so far
$t = str_replace("\r", "\n", $t);

echo $t;
?>
<br /><br /><span class="subtle"><small>HTM version of <?php echo '<em><a href="', htmlspecialchars($text_file), '">', htmlspecialchars($text_file), '</a></em> generated on ', gmdate('d M, Y'); ?> using <a href="http://www.bioinformatics.org/phplabware/internal_utilities">rTxt2htm</a> from PHP Labware</small></span>
</div><!-- ended div body -->
<?php }else // show form
{
 // nothing to convert (no submission, or a submission without usable text): print the form, filled with the current settings, which are escaped for use in the markup; the form posts back to rTxt2htm.php with the hidden 'action' field set to 1, which is what the form handling at the top of the script looks for; the hidden MAX_FILE_SIZE field carries $maxsize
 echo '<hr />
 <br />
 <form id="form" enctype="multipart/form-data" action="rTxt2htm.php" method="post">
 <div><input type="hidden" name="action" value="1" id="action" />
 <table summary="form" cellpadding="2">
 <tr><td style="text-align: right">File:</td><td style="text-align: left"><input type="hidden" name="MAX_FILE_SIZE" id="MAX_FILE_SIZE" value="', htmlspecialchars($maxsize), '" /><input type="file" name="text_file" id="text_file" /></td><td style="text-align: left"><span class="subtle">upload the text-file; upto ', htmlspecialchars($maxsize/1000), ' KB</span></td></tr>
 <tr><td style="text-align: right">Or type/paste:</td><td style="text-align: left"><textarea name="text_in" id="text_in" rows="5" cols="50"></textarea></td><td style="text-align: left"><span class="subtle">text input in right format; upto ', htmlspecialchars($maxsize), ' chars</span></td></tr>
 <tr><td style="text-align: right">Character encoding:</td><td style="text-align: left"><input type="text" size="50" name="enc" value="', htmlspecialchars($enc), '" id="enc" /></td><td style="text-align: left"><span class="subtle">best if same as file\'s; <a href="http://www.iana.org/assignments/character-sets">IANA-recognized</a> value; * (any auto-discovered value will overwrite it)</span></td></tr>
 <tr><td style="text-align: right">CSS style:</td><td style="text-align: left"><textarea rows="5" cols="50" name="css" id="css">', htmlspecialchars($css), '</textarea></td><td style="text-align: left"><span class="subtle">need not be changed</span></td></tr>
 <tr><td style="text-align: right">Description:</td><td style="text-align: left"><input type="text" size="50" name="meta_desc" value="', htmlspecialchars($meta_desc), '" id="meta_desc" /></td><td style="text-align: left"><span class="subtle">*</span></td></tr>
 <tr><td style="text-align: right">Keywords:</td><td style="text-align: left"><input type="text" size="50" name="meta_kword" value="', htmlspecialchars($meta_kword), '" id="meta_kword" /></td><td style="text-align: left"><span class="subtle">*</span></td></tr>
 <tr><td style="text-align: right">Language:</td><td style="text-align: left"><input type="text" size="50" name="lang" value="', htmlspecialchars($lang), '" id="lang" /></td><td style="text-align: left"><span class="subtle"><a href="http://www.iana.org/assignments/language-subtag-registry">IANA-recognized</a> value; *</span><br /></td></tr>
 <tr><td style="text-align: right">Title:</td><td style="text-align: left"><input type="text" size="50" name="title" value="', htmlspecialchars($title), '" id="title" /></td><td style="text-align: left"><span class="subtle">*</span><br /></td></tr>
 <tr><td>&nbsp;</td><td colspan="2" style="text-align: left;"><input type="submit" value="Convert" /> <input type="checkbox" id="direct" name="direct" value="1" />Direct download</td></tr>
 </table>
 </div>
 </form>
 <span class="subtle" style="float: right;"><a href="rTxt2htm_README.txt">txt</a> / <a href="rTxt2htm_README.htm">htm</a> documentation | <a href="http://www.bioinformatics.org/phplabware/internal_utilities">rTxt2htm</a> from PHP Labware</span><hr style="clear: both;" />';
}
?></div><!-- ended div top -->
</body>
</html><?php
/**
 * Gives the file name that is offered to the browser for the downloaded HTML.
 *
 * The extension 'txt', 'text' or 'doc' (in any letter-case) is replaced with 'htm'; for any
 * other name '.htm' is appended. Double quotes and control characters are dropped because the
 * name goes inside a quoted header parameter. A name that is empty (as it is when the text
 * was pasted instead of uploaded) is taken to be 'readme', so that the result is never just '.htm'.
 *
 * @param string $text_file name of the text file; can be empty
 * @return string file name ending in '.htm'
 */
function rtxt2htm_download_name($text_file){
 $name = preg_replace('`["\x00-\x1f\x7f]`', '', (string)$text_file);
 if(!strlen($name)){
  $name = 'readme';
 }
 $dot = strrchr($name, '.');
 $extn = ($dot === false) ? '' : (string)substr($dot, 1);
 if(in_array(strtolower($extn), array('txt', 'text', 'doc'), true)){
  return substr($name, 0, -strlen($extn)). 'htm';
 }
 return $name. '.htm';
}

// direct download: the page was buffered instead of being sent; take it out of the buffer and send it as an attachment
if(!empty($action) && !empty($direct)){
 $out = ob_get_clean();
 header('Accept-Ranges: bytes');
 header('Content-Type: text/html; charset='. $enc);
 header("Content-Transfer-Encoding: binary\n");
 header('Content-Disposition: attachment; filename="'. rtxt2htm_download_name($text_file). '"');
 echo $out;
 exit;
}