<?php 
 
// If we want to implement error collecting here, we'll need to use some sort
// of global data (probably trigger_error) because it's impossible to pass
// $config or $context to the callback functions.
 
/** 
 * Handles referencing and dereferencing character entities
 */ 
class HTMLPurifier_EntityParser
{

    /**
     * Entity lookup object, lazily fetched from
     * HTMLPurifier_EntityLookup::instance() the first time a named,
     * non-special entity has to be resolved. Its ->table property maps
     * entity names to their replacement strings.
     */
    protected $_entity_lookup;

    /**
     * Callback regex string for parsing entities.
     *
     * Capture groups:
     *   1. hex digits of a "&#x...;" reference
     *   2. decimal digits of a "&#...;" reference (leading zeros skipped)
     *   3. entity name (XML style)
     * The trailing semicolon is optional.
     */
    protected $_substituteEntitiesRegex =
'/&(?:[#]x([a-fA-F0-9]+)|[#]0*(\d+)|([A-Za-z_:][A-Za-z0-9.\-_:]*));?/';
//     1. hex             2. dec      3. string (XML style)


    /**
     * Decimal to parsed string conversion table for special entities.
     */
    protected $_special_dec2str =
            array(
                    34 => '"',
                    38 => '&',
                    39 => "'",
                    60 => '<',
                    62 => '>'
            );

    /**
     * Stripped entity names to decimal conversion table for special entities.
     */
    protected $_special_ent2dec =
            array(
                    'quot' => 34,
                    'amp'  => 38,
                    'lt'   => 60,
                    'gt'   => 62
            );

    /**
     * Substitutes non-special entities with their parsed equivalents.
     * Special entities (those listed in $_special_dec2str and
     * $_special_ent2dec) are left untouched. This is meant to be run once,
     * before everything else, rather than on every parsed character section.
     *
     * @param $string String to have non-special entities parsed.
     * @return Parsed string.
     */
    public function substituteNonSpecialEntities($string) {
        // it will try to detect missing semicolons, but don't rely on it
        return preg_replace_callback(
            $this->_substituteEntitiesRegex,
            array($this, 'nonSpecialEntityCallback'),
            $string
            );
    }

    /**
     * Callback function for substituteNonSpecialEntities() that does the work.
     *
     * @param $matches  PCRE matches array, with 0 the entire match, and
     *                  either index 1, 2 or 3 set with a hex value, dec value,
     *                  or string (respectively).
     * @return Replacement string.
     */
    protected function nonSpecialEntityCallback($matches) {
        // replaces all but big five
        $entity = $matches[0];
        $is_num = isset($entity[1]) && $entity[1] === '#';
        if ($is_num) {
            $is_hex = isset($entity[2]) && $entity[2] === 'x';
            $code = $is_hex ? hexdec($matches[1]) : (int) $matches[2];

            // abort for special characters
            if (isset($this->_special_dec2str[$code])) {
                return $entity;
            }

            return HTMLPurifier_Encoder::unichr($code);
        }

        $name = $matches[3];

        // named special entities are left as-is as well
        if (isset($this->_special_ent2dec[$name])) {
            return $entity;
        }

        // fetch the lookup table only when it is actually needed
        if (!$this->_entity_lookup) {
            $this->_entity_lookup = HTMLPurifier_EntityLookup::instance();
        }

        // unknown names are passed through unchanged
        return isset($this->_entity_lookup->table[$name]) ?
            $this->_entity_lookup->table[$name] :
            $entity;
    }

    /**
     * Substitutes only special entities with their parsed equivalents.
     * Every other entity is left untouched.
     *
     * @notice We try to avoid calling this function because otherwise, it
     * would have to be called a lot (for every parsed section).
     *
     * @param $string String to have special entities parsed.
     * @return Parsed string.
     */
    public function substituteSpecialEntities($string) {
        return preg_replace_callback(
            $this->_substituteEntitiesRegex,
            array($this, 'specialEntityCallback'),
            $string);
    }

    /**
     * Callback function for substituteSpecialEntities() that does the work.
     *
     * This callback has same syntax as nonSpecialEntityCallback().
     *
     * @param $matches  PCRE-style matches array, with 0 the entire match, and
     *                  either index 1, 2 or 3 set with a hex value, dec value,
     *                  or string (respectively).
     * @return Replacement string.
     */
    protected function specialEntityCallback($matches) {
        $entity = $matches[0];
        $is_num = isset($entity[1]) && $entity[1] === '#';
        if ($is_num) {
            $is_hex = isset($entity[2]) && $entity[2] === 'x';
            $code = $is_hex ? hexdec($matches[1]) : (int) $matches[2];
        } else {
            $name = $matches[3];
            if (!isset($this->_special_ent2dec[$name])) {
                return $entity;
            }
            // The name table only yields the code point; it still has to be
            // translated to the character below (returning the code point
            // itself would turn "&amp;" into "38").
            $code = $this->_special_ent2dec[$name];
        }

        return isset($this->_special_dec2str[$code]) ?
            $this->_special_dec2str[$code] :
            $entity;
    }

}
 
// vim: et sw=4 sts=4 
 
 |