Thank you to anyone who has already donated - your generous donations helped make three months of treatment possible.

My brother Nate continues to fight stage IV Hodgkin's lymphoma. He's just 31, with a wife and baby girl. They have no active income (since he's been unable to return to work), no insurance, and cannot afford the treatment he needs. Nate and his family need your help. Please consider a donation; every dollar helps. Thanks.


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
<?php 

class HTML {

    /** Truncate HTML to a maximum number of visible characters, closing any
     * tags that are still open at the cut point.
     *
     * Tags count as zero characters, every HTML entity counts as one character
     * and plain text is counted in UTF-8 characters (not bytes). Unpaired tags
     * (br, img, ...) and self-closing tags (<foo />) are copied through without
     * expecting a closing tag. Closing tags that directly follow the last
     * counted character are still copied, so input that fits exactly is
     * returned unchanged.
     *
     * Limitation: HTML comments (<!-- ... -->) are not recognised as markup
     * and are counted as ordinary text.
     *
     * @param string $html UTF-8 encoded HTML fragment
     * @param int $max_length Maximum number of visible characters to keep;
     *                        negative values are treated as 0
     * @param string $indicator Suffix to use if string was truncated. It is
     *                          inserted before the generated closing tags.
     * @return string
     */
    public static function truncate($html, $max_length, $indicator = '&hellip;' )
    {
        $html = (string) $html;
        $max_length = max(0, (int) $max_length);

        $html_bytes = strlen($html); // preg offsets are BYTE offsets, so track bytes
        $position = 0;               // byte offset just after the last tag/entity consumed
        $output = '';
        $output_length = 0;          // number of visible characters stored in $output
        $tag_stack = array();        // names of tags opened but not yet closed
        $truncated = false;

        /** these tags don't have matching closing elements, in HTML (in XHTML they
         * theoretically need a closing /> )
         * @see http://www.netstrider.com/tutorials/HTMLRef/a_d.html
         * @see http://www.w3schools.com/tags/default.asp
         * @see http://stackoverflow.com/questions/3741896/what-do-you-call-tags-that-need-no-ending-tag
         */
        $unpaired_tags = array( 'doctype', '!doctype',
            'area','base','basefont','bgsound','br','col',
            'embed','frame','hr','img','input','link','meta',
            'param','sound','spacer','wbr');

        // A tag (the name may contain digits, e.g. h2, be upper case, or be
        // !doctype) or an HTML entity. The pattern is pure ASCII, so a match
        // can never start or end inside a multi-byte UTF-8 sequence.
        $pattern = '{</?(!?[a-zA-Z][a-zA-Z0-9:-]*)[^>]*>|&#?[a-zA-Z0-9]+;}';

        while ($position < $html_bytes)
        {
            if (preg_match($pattern, $html, $match, PREG_OFFSET_CAPTURE, $position))
            {
                list($tag, $tag_position) = $match[0];
            }
            else
            {
                // no more markup: the rest of the input is plain text
                $tag = null;
                $tag_position = $html_bytes;
            }

            // text leading up to the tag: slice by bytes, count by characters
            $text = (string) substr($html, $position, $tag_position - $position);
            $text_length = mb_strlen($text, 'UTF-8');
            $room = $max_length - $output_length;

            if ($text_length > $room)
            {
                // only part of the text fits; cut on a character boundary
                $output .= mb_substr($text, 0, $room, 'UTF-8');
                $truncated = true;
                break;
            }

            // store everything, it wasn't too long
            $output .= $text;
            $output_length += $text_length;

            if ($tag === null)
            {
                break;
            }

            if ($tag[0] === '&') // HTML entity: counts as one character
            {
                if ($output_length >= $max_length)
                {
                    $truncated = true;
                    break;
                }
                $output .= $tag;
                $output_length++;
            }
            else
            {
                $tag_inner = $match[1][0];
                if ($tag[1] === '/') // closing tag
                {
                    // Closing tags take no room, so they are always copied,
                    // even once the limit is reached.
                    $output .= $tag;
                    // If input tags aren't balanced, we leave the unmatched tag
                    // on the stack so hopefully we're not introducing more
                    // problems.
                    if (!empty($tag_stack)
                            && strcasecmp(end($tag_stack), $tag_inner) === 0)
                    {
                        array_pop($tag_stack);
                    }
                }
                else
                {
                    // New content starts here; if the limit is already
                    // reached, stop instead of emitting an empty element.
                    if ($output_length >= $max_length)
                    {
                        $truncated = true;
                        break;
                    }
                    $output .= $tag;

                    $self_closing = substr($tag, -2, 1) === '/';
                    $unpaired = in_array(strtolower($tag_inner), $unpaired_tags, true);
                    if (!$self_closing && !$unpaired)
                    {
                        $tag_stack[] = $tag_inner; // push tag onto the stack
                    }
                }
            }

            // Continue after the tag we just found (byte length, not characters)
            $position = $tag_position + strlen($tag);
        }

        // add terminator if content was dropped
        if ( $truncated )
            $output .= $indicator;

        // Close any open tags, innermost first
        while (!empty($tag_stack))
            $output .= '</'.array_pop($tag_stack).'>';

        return $output;
    }

}

?>

Unit Test

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
<?php

/**
 * Test class for HTML .
 */
class HTMLTest extends PHPUnit_Framework_TestCase
{

    /**
     * Data sets for testTruncate.
     *
     * Each row holds, in order: the input markup, the maximum length passed
     * to HTML::truncate, and the expected result.
     *
     * @return array
     */
    public function dataTruncate( )
    {
        $cases = array();

        // Input is expected to come back unchanged.
        $untouched = '<b>&lt;Hello&gt;</b> <img src="world.png" alt="" /> world!';
        $cases[] = array($untouched, 15, $untouched);

        // Cut falls in top-level text after a self-closing tag and an unpaired
        // <br>; the input also carries stray </h2> closers (one nested, one at
        // top level).
        $cases[] = array(
            '<b></h2>&lt;Hello&gt;</b></h2> <br><img src="world.png" alt="" /> world!',
            10,
            '<b></h2>&lt;Hello&gt;</b></h2> <br><img src="world.png" alt="" /> w&hellip;',
        );

        // Cut falls inside nested tags, which must be closed afterwards.
        $cases[] = array(
            '<hr><table><tr><td>Heck, </td><td>throw</td></tr><tr><td>in a</td><td>table</td></tr></table>',
            10,
            '<hr><table><tr><td>Heck, </td><td>thro&hellip;</td></tr></table>',
        );

        // Character counting with HTML entities (2 digit and 4 digit numeric).
        $cases[] = array(
            '<em><b>&lt;Hello&gt;</b>&#20;&#8225;world!</em>',
            10,
            '<em><b>&lt;Hello&gt;</b>&#20;&#8225;w&hellip;</em>',
        );

        return $cases;
    }

    /**
     * HTML::truncate must produce the expected markup for each data set.
     *
     * @dataProvider dataTruncate
     */
    public function testTruncate($input, $length, $output)
    {
        $this->assertEquals($output, HTML::truncate($input, $length));
    }

}

?>