# Parses HTML content into a document and reports the encoding used.
#
# The encoding is resolved in this order: a charset found inside a
# <meta http-equiv="content-type" ...> tag in +content+, then the
# +encoding+ argument, then the default "utf8".
#
# content::  the HTML source to parse.
# encoding:: fallback encoding name used when the page declares none
#            (may be nil).
# options::  extra options handed to Tidy (may be nil). Entries in
#            TIDY_OPTIONS take precedence over these. The hash passed in
#            is never modified.
# parser::   :tidy (the default, also used when nil) or :html_parser.
#
# Returns Parsed[document, encoding].
#
# Raises HTMLParseError when the parser is unknown or when parsing fails;
# in the latter case the underlying error is passed to HTMLParseError.new.
# Signals and exit requests (Interrupt, SystemExit, ...) are not trapped.
def parse_page(content, encoding = nil, options = nil, parser = :tidy)
  # A charset declared in the page itself overrides the supplied encoding.
  if (meta = content.match(/(<meta\s*([^>]*)http-equiv=['"]?content-type['"]?([^>]*))/i))
    # Require at least one character so that an empty "charset=" does not
    # replace the supplied/default encoding with an empty string.
    if (charset = meta[0].match(/charset=([\w-]+)/i))
      encoding = charset[1]
    end
  end
  encoding ||= "utf8"

  case (parser || :tidy)
  when :tidy
    find_tidy
    # merge (rather than update) builds a new hash, leaving the caller's
    # options untouched while TIDY_OPTIONS still win on conflicting keys.
    tidy_options = (options || {}).merge(TIDY_OPTIONS)
    # Tidy receives the name lower-cased and without dashes ("UTF-8" => "utf8").
    tidy_options[:input_encoding] = encoding.delete("-").downcase
    document = Tidy.open(tidy_options) do |tidy|
      html = tidy.clean(content)
      HTML::Document.new(html).find(:tag=>"html")
    end
  when :html_parser
    document = HTML::HTMLParser.parse(content).root
  else
    raise HTMLParseError, "No parser #{parser}"
  end

  Parsed[document, encoding]
rescue HTMLParseError
  # Already the documented error type: pass it through without re-wrapping.
  raise
rescue StandardError, LoadError => error
  # NOTE(review): LoadError is included on the assumption that find_tidy may
  # fail to load the Tidy library; confirm against find_tidy.
  raise HTMLParseError.new(error)
end