jsFlags = $jsFlags; $this->htmlOptions = $htmlOptions; } function HTMLDocument($input, $insert='', $inject=false, $footer='') { // // Apply parsing that only needs to be done once.. // // Remove titles if option is enabled if ( $this->htmlOptions['stripTitle'] ) { $input = preg_replace('##is', '', $input, 1); } // Remove and record a href $input = preg_replace_callback('#]{1,1000}))(?(1)\\1|)[^>]*>#i', 'html_stripBase', $input, 1); // Proxify url= values in meta redirects $input = preg_replace_callback('#content\s*=\s*(["\\\'])?[0-9]+\s*;\s*url=([\\\'"]|&\#39;)?((?(?<=")[^"]+|(?(?<=\\\')[^\\\']+|[^\\\'" >]+)))(?(2)\\2|)(?(1)\\1|)#i', 'html_metaRefresh', $input, 1); // Process forms $input = preg_replace_callback('#]*)>(.*?)#is', 'html_form', $input); // Remove scripts blocks (avoids individual processing below) if ( $this->htmlOptions['stripJS'] ) { $input = preg_replace('#]*>.*?#is', '', $input); } // // Split up the document into its different types and parse them // // Build up new document into this var $new = ''; $offset = 0; // Find instances of script or style blocks while ( preg_match('#<(s(?:cript|tyle))[^>]*>#i', $input, $match, PREG_OFFSET_CAPTURE, $offset) ) { // What type of block is this? $block = strtolower($match[1][0]); // Start position of content $outerStart = $match[0][1]; $innerStart = $outerStart + strlen($match[0][0]); // Determine type of end tag and find it's position $endTag = ""; $innerEnd = stripos($input, $endTag, $innerStart); $outerEnd = $innerEnd + strlen($endTag); // Parse everything up till here and add to the new document $new .= $this->HTML(substr($input, $offset, $innerStart - $offset)); // Find parsing function $parseFunction = $block == 'style' ? 
'CSS' : 'JS' ; // Add the parsed block $new .= $this->$parseFunction(substr($input, $innerStart, $innerEnd - $innerStart)); // Move offset to new position $offset = $innerEnd; } // And add the final chunk (between last script/style block and end of doc) $new .= $this->HTML(substr($input, $offset)); // Replace input with the updated document $input = $new; // // Now add our own code bits // // Insert our mini form after the if ( $insert !== false ) { // Check for a frameset if ( ( $useFrames = stripos($input, ']+src\s*=\s*([\\\'"])?((?(1)(?(?<=")[^"]{1,1000}|[^\\\']{1,1000})|[^\s"\\\'>]{1,1000}))(?(1)\\1|)#i', 'html_flagFrames', $input); } // Attempt to add after body $input = preg_replace('#(]*>)#i', '$1' . $insert, $input, 1, $tmp); // Check it inserted and append (if not a frameset) if ( ! $tmp && ! $useFrames ) { $input = $insert . $input; } } // Insert our javascript library if ( $inject ) { // Generate javascript to insert $inject = injectionJS(); // Add our proxy javascript after $input = preg_replace('#(]*>)#i', '$1' . $inject, $input, 1, $tmp); // If no , just prepend if ( ! $tmp ) { $input = $inject . $input; } } // Add anything to the footer? if ( $footer ) { $input = preg_replace('#(]*>)#i', $footer . '$1', $input, 1, $tmp); // If no , just append the footer if ( ! $tmp ){ $input .= $footer; } } // Return new document return $input; } // Parse HTML sections function HTML($input) { // Removing objects? Follow spec and display inner content of object tags instead. if ( $this->htmlOptions['stripObjects'] ) { // Remove all object tags (including those deprecated but still common) $input = preg_replace('#<(?>object|applet|param|embed)[^>]*>#i', '', $input, -1, $tmp); // Found any? 
Remove the corresponding end tags if ( $tmp ) { $input = preg_replace('#object|applet|param|embed)>#i', '', $input, $tmp); } } else { // Parse tags $input = preg_replace_callback('#]+value\s*=\s*([\\\'"])?((?(1)(?(?<=")[^"]{1,1000}|[^\\\']{1,1000})|[^\s"\\\'>]{1,1000}))(?(1)\\1|)[^>]*>#i', 'html_paramValue', $input); // To do: proxify object related URLs } // Show content within

' . $add . $input[2] . ''; }

// Proxify the action="URL" value in forms.
// $input[1] is the (optional) quote delimiter, $input[2] the URL.
function html_formAction($input) {
    return 'action=' . $input[1] . proxifyURL($input[2]) . $input[1];
}

// Encode input names.
// $input[1] is the (optional) quote delimiter, $input[2] the field name.
function html_inputName($input) {
    return 'name=' . $input[1] . inputEncode($input[2]) . $input[1];
}

// Proxify URL values in attributes.
// $input[0] is the whole matched tag, $input[3] the raw attribute value.
function html_attribute($input) {

    // Is this an iframe? ("iframe" directly after the first char of the match)
    $flag = stripos($input[0], 'iframe') === 1 ? 'frame' : '';

    // URL occurred as value of an attribute and should have been htmlspecialchar()ed.
    // We need to do the job of the browser and decode before proxifying,
    // then re-encode the result for output.
    $proxified = htmlspecialchars(proxifyURL(htmlspecialchars_decode($input[3]), $flag));

    // Note: str_replace() swaps every occurrence of the raw value inside the tag
    return str_replace($input[3], $proxified, $input[0]);
}

// Flag frames in a frameset so only the first one shows the mini-form.
// This could be done in the above callback but adds extra processing
// when 99% of the time, it won't be needed.
// $input[0] is the whole match, $input[2] the frame URL.
function html_flagFrames($input) {

    // Remembers (across calls) whether the first frame has been seen
    static $addFlag;

    // If it's the first frame, leave it but set the flag var
    if ( ! isset($addFlag) ) {
        $addFlag = true;
        return $input[0];
    }

    // Compare against false explicitly: strpos() returns 0 (falsy) when
    // the "?" is the very first character of the URL.
    $hasQuery = strpos($input[2], '?') !== false;

    // Add the frame flag
    $newURL = $input[2] . ( $hasQuery ? '&f=frame' : 'fframe/' );

    return str_replace($input[2], $newURL, $input[0]);
}


/*****************************************************************
* CSS callbacks
******************************************************************/

// Proxify CSS url(LOCATION)
function css_URL($input) {
    return 'url(' . proxifyURL(trim($input[1])) . ')';
}

// Proxify CSS @import "URL"
function css_import($input) {
    return '@import "' . proxifyURL($input[1]) . '"';
}

// Proxify CSS src=
function css_src($input) {
    return 'src=' . $input[1] . proxifyURL($input[2]) . $input[1];
}


// Callbacks for use with unique URLs and cached CSS.
// The returned text acts as a marker for quick and easy processing later.
// NOTE(review): as written, these return fixed text and never use the
// matched URL - confirm the marker text is complete.

// Unique CSS url(LOCATION)
function css_URL_unique($input) {
    return 'url()';
}

// Unique CSS @import "URL"
function css_import_unique($input) {
    return '@import ""';
}

// Unique CSS src=
function css_src_unique($input) {
    return 'src=' . $input[1] . '' . $input[1];
}


/*****************************************************************
* Helper functions
******************************************************************/

// Take a string, and check that the next non-whitespace char is the
// passed in char (X). Return false if non-whitespace and non-X char is
// found (or the end of the string is reached). Otherwise, return the
// position of X.
// If $pastChar is true, ignore whitespace after finding X and return
// the position of the last post-X whitespace char.
function str_checknext($input, $char, $offset, $pastChar=false) {

    $length = strlen($input);

    for ( $i = $offset; $i < $length; ++$i ) {

        // Examine char
        switch ( $input[$i] ) {

            // Ignore whitespace
            case ' ':
            case "\t":
            case "\r":
            case "\n":
                break;

            // Accept the desired char
            case $char:

                // Move past this to the next non-whitespace?
                if ( $pastChar ) {
                    return $i + strspn($input, " \t\r\n", $i+1);
                }

                // No $pastChar, just return X offset
                return $i;

            // Non-accepted char, return false
            default:
                return false;
        }
    }

    return false;
}

// Same as above but go backwards: starting at $offset and walking towards
// the start of the string, skip whitespace and return the position of
// $char if it is the first non-whitespace char found, false otherwise.
function str_checkprev($input, $char, $offset) {

    // Nothing to examine if the offset does not point inside the string
    if ( $offset < 0 || ! isset($input[$offset]) ) {
        return false;
    }

    // Walk back to and INCLUDING index 0, so a match on the very first
    // character of the string is reported (as position 0, not false).
    for ( $i = $offset; $i >= 0; --$i ) {

        // Examine char
        switch ( $input[$i] ) {

            // Ignore whitespace
            case ' ':
            case "\t":
            case "\r":
            case "\n":
                break;

            // Accept the desired char, return current position
            case $char:
                return $i;

            // Non-accepted char, return false
            default:
                return false;
        }
    }

    return false;
}


// Analyze javascript and return offset positions.
// Default is to find the end of the statement, indicated by:
//   (1) ; while not in string
//   (2) newline which, if not there, would create invalid syntax
//   (3) a closing bracket (object, language construct or function call) for which
//       no corresponding opening bracket was detected AFTER the passed offset
// If (int) $argPos is true, we return an array of the start and end position
// for the nth argument, where n = $argPos. The $start position must be just inside
// the parenthesis of the function call we're interested in.
//
// $input  - the javascript source to scan
// $start  - offset to start scanning from (0 is valid)
// $argPos - false, or the 1-based number of the argument to locate
//
// Returns the end offset (int), or array(start, end) when $argPos is given.
// If no end is found, the length of $input is used as the end.
function analyze_js($input, $start, $argPos = false) {

    // Set chars we're interested in
    $specialChars = ";\n\r\"'+{}()[]";

    // Add , if looking for an argument position
    if ( $argPos ) {
        $specialChars .= ',';
        $currentArg = 1;
    }

    $length       = strlen($input);
    $end          = false;
    $openObjects  = 0;
    $openBrackets = 0;
    $openArrays   = 0;
    $i            = $start;

    // Loop through the input, stopping only at special chars
    while ( $end === false ) {

        // Jump to the next special char. The resulting offset is compared
        // against the length explicitly (not tested for truthiness) so that
        // a special char sitting at offset 0 is still processed.
        $i += strcspn($input, $specialChars, $i);

        if ( $i >= $length ) {
            break;
        }

        $char = $input[$i];

        switch ( $char ) {

            // Starting string delimiters
            case '"':
            case "'":

                // Find the corresponding end delimiter. A delimiter is escaped
                // only if preceded by an ODD number of backslashes, so a string
                // ending in an escaped backslash ('a\\') is closed correctly.
                $close = $i;

                do {
                    $close = strpos($input, $char, $close + 1);

                    if ( $close === false ) {
                        break;
                    }

                    $slashes = 0;
                    for ( $j = $close - 1; $j > $i && $input[$j] === '\\'; --$j ) {
                        ++$slashes;
                    }

                } while ( $slashes % 2 );

                // No end delimiter: assume the end is the end of the doc
                if ( $close === false ) {
                    break 2;
                }

                $i = $close;
                break;

            // End of operation?
            case ';':
                $end = $i;
                break;

            // New lines
            case "\n":
            case "\r":

                // Newlines are OK if occurring within open brackets, arrays or objects
                // (or anywhere while looking for an argument).
                if ( $openObjects || $openBrackets || $openArrays || $argPos ) {
                    break;
                }

                // Newlines are also OK if followed by an opening function OR concatenation
                // e.g. someFunc\n(params) or someVar \n + anotherVar

                // Position of the last whitespace char after this newline
                $tmp = $i + strspn($input, " \t\r\n", $i+1);

                // And compare the char after it to the allowed chars
                if ( isset($input[$tmp+1]) && ( $input[$tmp+1] == '(' || $input[$tmp+1] == '+' ) ) {
                    $i = $tmp;
                    break;
                }

                // Newline not indicated as OK, set the end to here
                $end = $i;
                break;

            // Concatenation
            case '+':

                // Our interest in the + operator is its use in allowing an expression
                // to span multiple lines. If we come across a +, move past all whitespace,
                // including newlines (which would otherwise indicate end of expression).
                $i += strspn($input, " \t\r\n", $i+1);
                break;

            // Opening chars (objects, parenthesis and arrays)
            case '{':
                ++$openObjects;
                break;

            case '(':
                ++$openBrackets;
                break;

            case '[':
                ++$openArrays;
                break;

            // Closing chars - is there a corresponding open char?
            // Yes = reduce stored count. No = end of statement.
            case '}':
                if ( $openObjects ) {
                    --$openObjects;
                } else {
                    $end = $i;
                }
                break;

            case ')':
                if ( $openBrackets ) {
                    --$openBrackets;
                } else {
                    $end = $i;
                }
                break;

            case ']':
                if ( $openArrays ) {
                    --$openArrays;
                } else {
                    $end = $i;
                }
                break;

            // Commas - tell us which argument it is
            // (only reachable when $argPos is set, so $currentArg exists)
            case ',':

                // Ignore commas inside other functions or whatnot
                if ( $openObjects || $openBrackets || $openArrays ) {
                    break;
                }

                // This comma closes the argument we want: end now
                if ( $currentArg == $argPos ) {
                    $end = $i;
                }

                // Increase the current argument number
                ++$currentArg;

                // This comma opens the argument we want: start just after it
                if ( $currentArg == $argPos ) {
                    $start = $i+1;
                }

                break;
        }

        ++$i;
    }

    // End not found? Use end of document
    if ( $end === false ) {
        $end = $length;
    }

    // Return array of start/end
    if ( $argPos ) {
        return array($start, $end);
    }

    // Return end
    return $end;
}

// Encode HTML block
// NOTE(review): as written, the returned literal is empty and neither $input
// nor the $s/$r tables below contribute to it - confirm the intended
// "javascript decoder" output.
function encodeHTML($input) {

    // Escape values: each char in $s pairs with the %XX escape at the same index in $r
    $s = array('a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z','A','B','C','D','E','F','G','H','I','J','K','L','M','N','O','P','Q','R','S','T','U','V','W','X','Y','Z','\'',"\r","\n",'-');
    $r = array('%61','%62','%63','%64','%65','%66','%67','%68','%69','%6a','%6b','%6c','%6d','%6e','%6f','%70','%71','%72','%73','%74','%75','%76','%77','%78','%79','%7a','%41','%42','%43','%44','%45','%46','%47','%48','%49','%4a','%4b','%4c','%4d','%4e','%4f','%50','%51','%52','%53','%54','%55','%56','%57','%58','%59','%5a','%27','%0d','%0a','%2D');

    // Return javascript decoder
    return '';
}