1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
<!---	
		Author:		Tristan Lee
		Date:		12/08/10
		Purpose:	Parses malformed ("tag soup") HTML into well-formed XHTML source using the TagSoup library
		Changes:	
--->

<cfcomponent displayname="XHTMLParser" output="false"
			 hint="Runs HTML through the TagSoup SAX parser and exposes the result as an XHTML string.">

	<!---
		Holds the java.io.ByteArrayOutputStream that TagSoup's XMLWriter wrote into.
		Stays an empty string until init() has completed a parse; toXHTML() uses
		that to detect an un-initialised component.
	--->
	<cfset variables.oBAOS = "">

	<!---
		Charset used both when encoding the writer's characters into the byte
		buffer and when decoding the buffer back to a string, so the round trip
		never depends on the JVM's default charset.
	--->
	<cfset variables.outputCharset = "UTF-8">

	<!--- Constructor --->
	<cffunction name="init" access="public" output="false" returntype="XHTMLParser"
				hint="Parses the supplied HTML (or the contents of the supplied file) and buffers the XHTML output. Returns this component.">
		<cfargument name="input" required="true" type="any"
					hint="Either a path to an existing file containing HTML, or the HTML markup itself.">

		<cfset var source = "">
		<cfset var oReader = "">
		<cfset var oInput = "">
		<cfset var oBuffer = "">
		<cfset var oOSW = "">
		<cfset var oXMLWriter = "">
		<cfset var oParser = "">
		<cfset var oSchema = "">

		<!---
			If the argument names an existing file its contents are parsed,
			otherwise the argument itself is treated as the markup.
			NOTE(review): a caller-supplied string that happens to be a readable
			path will be read from disk - confirm callers never pass untrusted
			input here.
		--->
		<cfif fileExists(arguments.input)>
			<cfset source = fileRead(arguments.input)>
		<cfelse>
			<cfset source = arguments.input>
		</cfif>

		<!--- HTML input/readers: String -> Reader -> SAX InputSource --->
		<cfset oReader = createObject("java", "java.io.StringReader").init(source)>
		<cfset oReader = createObject("java", "java.io.BufferedReader").init(oReader)>
		<cfset oInput = createObject("java", "org.xml.sax.InputSource").init(oReader)>

		<!---
			XHTML output/writers. The buffer is kept in a local until parsing
			succeeds so a failed parse never leaves a half-written stream in
			the component's state.
		--->
		<cfset oBuffer = createObject("java", "java.io.ByteArrayOutputStream").init()>
		<cfset oOSW = createObject("java", "java.io.OutputStreamWriter").init(oBuffer, variables.outputCharset)>
		<cfset oXMLWriter = createObject("java", "org.ccil.cowan.tagsoup.XMLWriter").init(oOSW)>

		<!--- Processor --->
		<cfset oParser = createObject("java", "org.ccil.cowan.tagsoup.Parser").init()>
		<cfset oSchema = createObject("java", "org.ccil.cowan.tagsoup.HTMLSchema").init()>
		<!--- Blank schema URI; presumably keeps a namespace URI off the emitted elements - TODO confirm --->
		<cfset oSchema.setURI("")>

		<!--- Output settings: no XML declaration, namespaces feature off, custom schema --->
		<cfset oXMLWriter.setOutputProperty(oXMLWriter.OMIT_XML_DECLARATION, "yes")>
		<cfset oParser.setContentHandler(oXMLWriter)>
		<cfset oParser.setFeature(oParser.namespacesFeature, false)>
		<cfset oParser.setProperty(oParser.schemaProperty, oSchema)>

		<!--- Parse the source; SAX events are serialised by the XMLWriter into oBuffer --->
		<cfset oParser.parse(oInput)>

		<!--- Push anything still held by the writer into the byte buffer before it is read --->
		<cfset oOSW.flush()>

		<!--- Publish the completed buffer only now that parsing has succeeded --->
		<cfset variables.oBAOS = oBuffer>

		<cfreturn this>
	</cffunction>

	<cffunction name="toXHTML" access="public" output="false" returntype="string"
				hint="Returns the first child element of the parsed document's body, serialised as XHTML without an XML declaration. Throws if init() has not been called or the buffered output cannot be parsed.">
		<cfset var xhtml = "">
		<cfset var result = "">

		<!--- variables.oBAOS is still the empty-string placeholder until init() has run --->
		<cfif NOT isInstanceOf(variables.oBAOS, "java.io.ByteArrayOutputStream")>
			<cfthrow message="Object not instantiated"
					 detail="You must call init(input = """"), passing a filepath or string first">
		</cfif>

		<cftry>
			<!--- Decode with the same charset init() wrote with, then take the first node under html/body --->
			<cfset xhtml = xmlParse(variables.oBAOS.toString(variables.outputCharset)).html.body.xmlChildren[1]>
			<!--- toString() on an XML node prepends an XML declaration; strip the first one --->
			<cfset result = reReplace(toString(xhtml), "<\?xml[^>]*>", "", "one" )>
			<cfcatch type="any">
				<cfthrow message="Output could not be parsed to XHTML"
						 detail="#cfcatch.message# - #cfcatch.detail#">
			</cfcatch>
		</cftry>

		<cfreturn result>
	</cffunction>
</cfcomponent>