# File lib/scraper/reader.rb, line 109
# Fetches +url+ over HTTP or HTTPS and returns the result as a Page.
#
# +url+ may be a String or a URI. Redirects are followed until the
# redirect limit is exhausted. After a permanent redirect (301) the new
# URL becomes the page URL; after any other redirect the original URL is
# kept as the page URL.
#
# Options (all optional):
# * <tt>:redirect_limit</tt> -- Number of requests that may still be made
#   (defaults to REDIRECT_LIMIT).
# * <tt>:http_timeout</tt> -- Open/read timeout (defaults to DEFAULT_TIMEOUT).
# * <tt>:user_agent</tt> -- Value for the User-Agent request header.
# * <tt>:last_modified</tt> -- Validator from a previous response, sent as
#   If-Modified-Since.
# * <tt>:etag</tt> -- Validator from a previous response, sent as
#   If-None-Match.
# * <tt>:source_url</tt> -- URL to report in the returned Page instead of
#   the URL actually requested.
#
# Returns a Page holding the URL, body, charset named in the Content-Type
# header (or nil), Last-Modified and ETag. When the server answers 304,
# the body and encoding are nil and the validators passed in the options
# are returned unchanged.
#
# Raises:
# * HTTPRedirectLimitError -- Redirect limit reached.
# * HTTPInvalidURLError -- URL cannot be parsed or is not http/https.
# * HTTPTimeoutError -- Request timed out, or server answered 408.
# * HTTPNotFoundError -- Server answered 404.
# * HTTPNoAccessError -- Server answered 401 or 403.
# * HTTPUnspecifiedError -- Any other failure or response.
def read_page(url, options = nil)
  options ||= {}
  redirect_limit = options[:redirect_limit] || REDIRECT_LIMIT
  raise HTTPRedirectLimitError if redirect_limit == 0

  if url.is_a?(URI)
    uri = url
  else
    begin
      uri = URI.parse(url)
    rescue StandardError => error
      raise HTTPInvalidURLError.new(error)
    end
  end
  # \A and \z anchor the whole string; ^ and $ would only anchor a line.
  raise HTTPInvalidURLError unless uri.scheme =~ /\Ahttps?\z/

  begin
    http = Net::HTTP.new(uri.host, uri.port)
    http.use_ssl = (uri.scheme == "https")
    http.close_on_empty_response = true
    http.open_timeout = http.read_timeout = options[:http_timeout] || DEFAULT_TIMEOUT
    # Net::HTTP rejects an empty request path, so "http://host" must be
    # requested as "/". A new string is built so uri.path is never modified.
    path = uri.path.to_s
    path = "/" if path.empty?
    path = "#{path}?#{uri.query}" if uri.query
    # TODO: Specify which content types are accepted.
    # TODO: GZip support.
    headers = {}
    headers["User-Agent"] = options[:user_agent] if options[:user_agent]
    # Validators from an earlier response go in the conditional request
    # headers; Last-Modified and ETag themselves are response headers.
    headers["If-Modified-Since"] = options[:last_modified] if options[:last_modified]
    headers["If-None-Match"] = options[:etag] if options[:etag]
    response = http.request_get(path, headers)
    # TODO: Ignore content types that do not map to HTML.
  rescue Timeout::Error => error
    raise HTTPTimeoutError.new(error)
  rescue StandardError => error
    raise HTTPUnspecifiedError.new(error)
  end

  case response
  when Net::HTTPSuccess
    content_type = response["Content-Type"]
    # The charset value may be quoted and may be followed by further
    # parameters; capture only the bare charset name.
    match = content_type && content_type.match(/charset=["']?([^\s;"']+)/i)
    Page[(options[:source_url] || uri), response.body, match && match[1],
         response["Last-Modified"], response["ETag"]]
  when Net::HTTPNotModified
    Page[(options[:source_url] || uri), nil, nil,
         options[:last_modified], options[:etag]]
  when Net::HTTPRedirection
    location = response["location"]
    begin
      # Location may be relative to the URL that was just requested.
      target = uri.merge(location)
    rescue StandardError
      # Missing or malformed Location: hand it over as is, so the
      # recursive call reports it as HTTPInvalidURLError.
      target = location
    end
    # Carry every option over so the follow-up request uses the same
    # user agent, timeout and validators.
    next_options = options.merge(:redirect_limit => redirect_limit - 1)
    if response.is_a?(Net::HTTPMovedPermanently)
      next_options.delete(:source_url) # New URL takes effect
    else
      next_options[:source_url] ||= uri # Old URL still in effect
    end
    read_page(target, next_options)
  when Net::HTTPNotFound
    raise HTTPNotFoundError
  when Net::HTTPUnauthorized, Net::HTTPForbidden
    raise HTTPNoAccessError
  when Net::HTTPRequestTimeOut
    raise HTTPTimeoutError
  else
    raise HTTPUnspecifiedError
  end
end