require 'anemone'
require 'nokogiri'

# A crawled page stored in the database. Pages are arranged in a tree
# (acts_as_tree, ordered by title); each page's parent is derived from the
# last link found in its stored breadcrumb HTML.
class Page < ActiveRecord::Base
  acts_as_tree :order => "title"

  # Prefix prepended to breadcrumb hrefs to build the URL looked up in the
  # database.
  SITE_ROOT = 'http://birmingham.gov.uk'

  # Links whose URL matches any of these patterns are dropped from the crawl.
  EXCLUDED_LINK_PATTERNS = [
    /c=Page&childpagename=SystemAdmin/,
    /pagename=BCC%252FCommon%252FWrapper%252FWrapper/,
    /Common%252FWrapper%252FInlineWrapper/,
    /MungoBlob/
  ].freeze

  # Sets this page's parent from the last link in its breadcrumb.
  #
  # If a Page with that URL is already stored it becomes the parent;
  # otherwise the URL is fetched with Anemone, stored through
  # Page.create_from_anemone_page and, when that succeeds, used as the
  # parent. Nothing is changed when the breadcrumb has no usable link.
  def set_parent_page_from_last_breadcrumb_item
    the_link = last_breadcrumb_url
    puts 'Result of link finding: ' + the_link.to_s
    return if the_link.blank?

    possible_parent = Page.find_by_url(the_link)
    if possible_parent
      # A breadcrumb that links back to this very page would make the record
      # its own parent and put a cycle into the tree.
      if possible_parent.id == id
        puts "Breadcrumb links to the page itself. Ignoring: #{the_link}"
        return
      end
      puts "Found parent by url: #{possible_parent.title}"
      self.parent_id = possible_parent.id
      save
      puts 'Saved new parent id'
    else
      puts "No page found by url: #{the_link}"
      new_page = Page.create_from_anemone_page(Anemone::Page.fetch(the_link))
      if new_page
        self.parent_id = new_page.id
        save
      end
    end
  end

  # Returns the stored page for the site root URL, or nil when there is none.
  def self.root_page
    Page.find_by_url("#{SITE_ROOT}/")
  end

  # Recomputes the parent of every stored page from its breadcrumb.
  # You may need to run this several times manually - sorry.
  def self.setup_hierarchy
    Page.find(:all).each do |page|
      page.set_parent_page_from_last_breadcrumb_item
    end
  end

  # Stores a crawled Anemone page as a Page record.
  #
  # The page is skipped when it has no parsed document or <html> element,
  # when its URL is already stored, when it has no <title>, or when the title
  # contains "404" or "error" (case-insensitive).
  #
  # @param page the page object handed over by Anemone
  # @return the saved Page, or false when the page was skipped or the save
  #   failed
  def self.create_from_anemone_page(page)
    puts 'Create from anemone page'
    return false if page.blank? || page.doc.blank?

    html = page.doc.at('html')
    return false if html.blank?

    url = page.url.to_s
    if Page.find_by_url(url)
      puts "Already indexed. Ignoring: #{url}"
      return false
    end

    title = page.doc.at('title')
    if title.blank?
      puts "Blank title. Ignoring: #{url}"
      return false
    end

    title_text = title.inner_html
    if title_text.include?('404') || title_text.downcase.include?('error')
      puts "404 found. Ignoring: #{url}"
      return false
    end

    record = Page.new
    record.title = title_text
    record.url = url
    record.page_source = html.inner_html

    breadcrumb = page.doc.css('#breadcrumb')
    record.breadcrumb = breadcrumb.first.inner_html unless breadcrumb.blank?

    page_content = page.doc.css('#content')
    record.content = page_content.first.inner_html unless page_content.blank?

    if record.save
      puts "Saved: #{record.title} : #{record.url}"
      return record
    end

    puts "Save failed: #{record.url}"
    false
  end

  # Crawls the site starting at start_url, stores every page that
  # Page.create_from_anemone_page accepts and prints how many were saved.
  #
  # @param start_url the URL handed to Anemone.crawl
  def self.crawl_bcc(start_url)
    counter = 0
    Anemone.crawl(start_url) do |anemone|
      anemone.focus_crawl do |page|
        # delete_if returns the (filtered) link array, which is the value
        # this block hands back to Anemone.
        page.links.delete_if do |link|
          link_url = link.to_s
          EXCLUDED_LINK_PATTERNS.any? { |pattern| link_url =~ pattern }
        end
        # Previously considered: also drop links that are already stored,
        # i.e. !Page.find_by_url(link.to_s).blank?
      end

      anemone.on_every_page do |page|
        counter += 1 if Page.create_from_anemone_page(page)
      end
    end
    puts "#{counter} pages saved"
  end

  private

  # Returns the absolute URL built from the last breadcrumb link that carries
  # an href, or nil when there is no such link. Every link's text and href
  # are printed while scanning.
  def last_breadcrumb_url
    hrefs = Nokogiri::HTML(breadcrumb.to_s).search('a').map do |link|
      puts link.content
      puts link['href']
      link['href']
    end
    # Anchors without an href cannot be followed; skip them.
    last_href = hrefs.reject { |href| href.blank? }.last
    last_href ? SITE_ROOT + last_href : nil
  end
end