1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
<?php 

class HTML {

    /**
     * Truncate an HTML fragment to a maximum number of visible characters and
     * close every element that is still open at the cut point.
     *
     * Counting rules:
     *  - tags, comments and <!doctype ...> declarations are copied through and
     *    cost nothing;
     *  - an HTML entity (&amp;, &#8225;, ...) counts as one character;
     *  - everything else is counted per UTF-8 character, never per byte.
     *
     * The indicator is appended (before the generated closing tags) only when
     * visible content was actually dropped. If the limit is reached and all
     * that is left of the input is closing tags, comments or whitespace, the
     * output is simply closed and no indicator is added.
     *
     * @param string $html       UTF-8 encoded HTML fragment
     * @param int    $max_length Maximum number of visible characters to keep
     * @param string $indicator  Suffix to use if the string was truncated
     * @return string
     */
    public static function truncate($html, $max_length, $indicator = '&hellip;' )
    {
        $html = (string) $html;
        $max_length = (int) $max_length;
        $encoding = 'UTF-8';

        $output_length = 0;   // number of counted characters stored so far in $output
        $position = 0;        // BYTE offset within $html just after the last token
        $tag_stack = array(); // names of elements opened but not yet closed
        $output = '';
        $truncated = false;

        /** These elements never take a closing tag in HTML (void elements plus
         * a few legacy ones), so they must not be pushed on the stack.
         * Declarations such as <!doctype> are matched by the token pattern
         * itself and never reach this list.
         */
        $unpaired_tags = array( 'doctype', '!doctype',
            'area','base','basefont','bgsound','br','col','command',
            'embed','frame','hr','img','input','keygen','link','meta',
            'param','sound','source','spacer','track','wbr');

        // One token is: a comment, a <!declaration>, an opening/closing tag
        // (group 1 = full tag name, digits included so "h2" is not read as
        // "h"), or an entity. No "u" modifier on purpose: matching stays
        // byte-wise so malformed UTF-8 cannot make preg_match() fail.
        $token_pattern = '{<!--.*?-->|<![a-z][^>]*>|</?([a-z][a-z0-9:_-]*)[^>]*>|&#?[a-zA-Z0-9]+;}is';

        // loop through, splitting at HTML entities or tags
        while ($output_length < $max_length
                && preg_match($token_pattern, $html, $match, PREG_OFFSET_CAPTURE, $position))
        {
            list($token, $token_position) = $match[0];

            // Text leading up to the token. preg offsets are bytes, so slice
            // with substr() and count with mb_strlen().
            $text = (string) substr($html, $position, $token_position - $position);
            $text_length = mb_strlen($text, $encoding);
            if ($output_length + $text_length > $max_length)
            {
                $output .= mb_substr($text, 0, $max_length - $output_length, $encoding);
                $output_length = $max_length;
                $truncated = true;
                break;
            }

            // store everything, it wasn't too long
            $output .= $text;
            $output_length += $text_length;

            if ($token[0] === '&') // HTML entity: counts as one character
            {
                if ($output_length >= $max_length)
                {
                    // The text above used up the budget exactly; taking the
                    // entity as well would exceed $max_length.
                    $truncated = true;
                    break;
                }
                $output .= $token;
                $output_length++;
            }
            else // tag, comment or declaration: copied through at no cost
            {
                $output .= $token;

                // Group 1 is only present for real tags.
                if (isset($match[1]) && $match[1][1] >= 0)
                {
                    $tag_name = $match[1][0];
                    if ($token[1] === '/') // closing tag
                    {
                        // If input tags aren't balanced, leave the stack
                        // alone so we don't introduce more problems.
                        if (strcasecmp((string) end($tag_stack), $tag_name) === 0)
                        {
                            array_pop($tag_stack);
                        }
                    }
                    else if (substr($token, -2, 1) !== '/'
                            && !in_array(strtolower($tag_name), $unpaired_tags, true))
                    {
                        // Opening tag that is neither self-closing nor void:
                        // remember it, spelled as in the input.
                        $tag_stack[] = $tag_name;
                    }
                }
            }

            // Continue after the token we just handled (byte length!)
            $position = $token_position + strlen($token);
        }

        if (!$truncated)
        {
            $remainder = (string) substr($html, $position);
            if ($output_length < $max_length)
            {
                // No more tokens: the remainder is plain text.
                $room = $max_length - $output_length;
                if (mb_strlen($remainder, $encoding) > $room)
                {
                    $output .= mb_substr($remainder, 0, $room, $encoding);
                    $truncated = true;
                }
                else
                {
                    $output .= $remainder;
                }
            }
            else
            {
                // Budget used up. The rest is dropped; flag it unless it is
                // nothing but closing tags, comments and whitespace.
                $leftover = preg_replace('{</[a-z][^>]*>|<!--.*?-->}is', '', $remainder);
                $truncated = ($leftover === null) || trim($leftover) !== '';
            }
        }

        // add terminator if anything visible was cut
        if ( $truncated )
            $output .= $indicator;

        // Close any open tags, innermost first
        while (!empty($tag_stack))
            $output .= '</'.array_pop($tag_stack).'>';

        return $output;
    }

}

?>

Unit Test

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
<?php

/**
 * PHPUnit test case exercising HTML::truncate().
 */
class HTMLTest extends PHPUnit_Framework_TestCase
{

    /**
     * Data sets for testTruncate().
     *
     * Each row holds, in order: the input HTML, the maximum length passed to
     * HTML::truncate(), and the exact string expected back.
     *
     * @return array
     */
    public function dataTruncate( )
    {
        $cases = array();

        // Input already fits within the limit: returned unchanged.
        $cases[] = array(
            '<b>&lt;Hello&gt;</b> <img src="world.png" alt="" /> world!',
            15,
            '<b>&lt;Hello&gt;</b> <img src="world.png" alt="" /> world!',
        );

        // Cut falls in top-level text, after a self-closing tag and an
        // unpaired <br>; the input also carries two stray </h2> tags (one
        // nested, one at top level).
        $cases[] = array(
            '<b></h2>&lt;Hello&gt;</b></h2> <br><img src="world.png" alt="" /> world!',
            10,
            '<b></h2>&lt;Hello&gt;</b></h2> <br><img src="world.png" alt="" /> w&hellip;',
        );

        // Cut falls inside nested elements, which must be closed afterwards.
        $cases[] = array(
            '<hr><table><tr><td>Heck, </td><td>throw</td></tr><tr><td>in a</td><td>table</td></tr></table>',
            10,
            '<hr><table><tr><td>Heck, </td><td>thro&hellip;</td></tr></table>',
        );

        // Entities (named, 2-digit and 4-digit numeric) each count as one.
        $cases[] = array(
            '<em><b>&lt;Hello&gt;</b>&#20;&#8225;world!</em>',
            10,
            '<em><b>&lt;Hello&gt;</b>&#20;&#8225;w&hellip;</em>'
        );

        return $cases;
    }

    /**
     * Runs HTML::truncate() with the default indicator and compares the
     * result against the expected string of the data set.
     *
     * @dataProvider dataTruncate
     */
    public function testTruncate($input, $length, $output)
    {
        $this->assertEquals($output, HTML::truncate($input, $length));
    }

}

?>