# Tokenizes +input+ and yields each piece to the caller's block as a
# two-element array <tt>[kind, string]</tt>.
#
# Kinds yielded by this method:
# :xmldecl, :doctype, :procins, :stag, :etag, :emptytag, :comment,
# :text_cdata_section, :text_pcdata and :text_cdata_content.
#
# Text between recognized markup is yielded as :text_pcdata. After a start
# tag whose ElementContent entry is :CDATA (and while the input is not
# considered XML), everything up to the end tag with the same name is
# yielded as a single :text_cdata_content piece; markup matched in between
# is not tokenized.
#
# input::  the string to scan.
# is_xml:: initial value of the XML flag. It becomes true when an XML
#          declaration is seen, or when the first start tag is not one of
#          the HTML element names checked below.
#
# Returns the two-element array <tt>[is_xml, is_html]</tt>.
def HTree.scan(input, is_xml=false)
  is_html = false
  cdata_content = nil   # name of the currently open CDATA-content element
  text_start = 0        # offset where the not-yet-yielded text begins
  first_element = true
  input.scan(/(#{Pat::XmlDecl})
             |(#{Pat::DocType})
             |(#{Pat::XmlProcIns})
             |(#{Pat::StartTag})
             |(#{Pat::EndTag})
             |(#{Pat::EmptyTag})
             |(#{Pat::Comment})
             |(#{Pat::CDATA})/ox) {
    match = $~
    str = match[0]
    if cdata_content
      # Inside a CDATA-content element only the matching end tag matters;
      # any other match stays part of the raw text.
      if match[5] && str[Pat::Name] == cdata_content
        text_end = match.begin(0)
        if text_start < text_end
          yield [:text_cdata_content, input[text_start...text_end]]
        end
        # Always step past the end tag, even when the element was empty;
        # otherwise the end tag would be yielded again as text later on.
        text_start = match.end(0)
        yield [:etag, str]
        cdata_content = nil
      end
    else
      text_end = match.begin(0)
      if text_start < text_end
        yield [:text_pcdata, input[text_start...text_end]]
      end
      text_start = match.end(0)
      if match.begin(1)
        yield [:xmldecl, str]
        is_xml = true
      elsif match.begin(2)
        Pat::DocType_C =~ str
        is_html = true if /\Ahtml\z/i =~ $1
        yield [:doctype, str]
      elsif match.begin(3)
        yield [:procins, str]
      elsif match.begin(4)
        yield [:stag, str]
        tagname = str[Pat::Name]
        if first_element
          # The first start tag decides the document flavour.
          if /\A(?:html|head|title|isindex|base|script|style|meta|link|object)\z/i =~ tagname
            is_html = true
          else
            is_xml = true
          end
          first_element = false
        end
        if !is_xml && ElementContent[tagname] == :CDATA
          cdata_content = tagname
        end
      elsif match.begin(5)
        yield [:etag, str]
      elsif match.begin(6)
        yield [:emptytag, str]
        first_element = false
      elsif match.begin(7)
        yield [:comment, str]
      elsif match.begin(8)
        yield [:text_cdata_section, str]
      else
        # One of the eight groups must have matched; the exception class is
        # left unchanged so callers see the same error as before.
        raise Exception, "unknown match [bug]"
      end
    end
  }
  # Flush whatever text follows the last recognized markup.
  text_end = input.length
  if text_start < text_end
    if cdata_content
      yield [:text_cdata_content, input[text_start...text_end]]
    else
      yield [:text_pcdata, input[text_start...text_end]]
    end
  end
  return is_xml, is_html
end