1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
package me.dylan.WebCrawler;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;

import javax.swing.text.BadLocationException;
import javax.swing.text.EditorKit;
import javax.swing.text.MutableAttributeSet;
import javax.swing.text.html.HTML;
import javax.swing.text.html.HTMLDocument;
import javax.swing.text.html.HTMLEditorKit;

/**
 * Minimal single-threaded web crawler.
 *
 * <p>Starting from a seed page it downloads the HTML, extracts the {@code href}
 * of every {@code <a>} tag, resolves each one against the page it was found on,
 * and then visits every http(s) link that has not been visited yet. There is no
 * page limit: a crawl only ends when no unvisited links remain or the crawling
 * thread is interrupted.
 *
 * <p>Instances are not thread-safe; all state is plain unsynchronized fields.
 */
public class WebC {
	/** Pause between two consecutive page fetches, in milliseconds. */
	private static final long FETCH_DELAY_MS = 1000L;

	/** Number of links recorded in {@link #urls} so far; printed as each link is found. */
	int linkamount=0;
	/** Every page for which a fetch has been attempted, in fetch order. */
	ArrayList<URL> visited = new ArrayList<URL>();
	/** Every absolute http(s) link discovered so far, in discovery order (duplicates kept). */
	ArrayList<String> urls = new ArrayList<String>();

	/**
	 * Creates the crawler and immediately starts crawling from
	 * {@code http://www.google.com}. A failure to load the seed page is printed
	 * to standard error and otherwise ignored.
	 */
	public WebC() {
		try {
			sendGetRequest("http://www.google.com");
		} catch (IOException e) {
			e.printStackTrace();
		} catch (BadLocationException e) {
			e.printStackTrace();
		}
	}

	/**
	 * Program entry point: constructing the crawler runs the crawl.
	 *
	 * @param args ignored
	 */
	public static void main(String[] args) {
		new WebC();
	}

	/**
	 * Fetches {@code path} and then crawls every page reachable from it.
	 *
	 * <p>The crawl is iterative (an explicit stack of pending links) rather than
	 * recursive, so deep link chains cannot overflow the call stack. Links are
	 * followed depth-first, last link on a page first. A failure on the start
	 * page is thrown to the caller; a failure on any later page is reported to
	 * standard error and that page is skipped, so one dead link does not end the
	 * crawl. If the thread is interrupted while pausing between fetches, the
	 * interrupt flag is restored and the method returns.
	 *
	 * @param path absolute http or https URL of the page to start from
	 * @throws MalformedURLException if {@code path} is not a valid http(s) URL
	 * @throws IOException if the start page cannot be downloaded
	 * @throws BadLocationException if the HTML parser fails on the start page
	 */
	public void sendGetRequest(String path) throws IOException, BadLocationException, MalformedURLException {
		URL start = new URL(path);
		if (!isVisited(start)) {
			visited.add(start);
		}

		// Used as a stack: links are appended in document order and taken from the end.
		ArrayList<String> pending = new ArrayList<String>();
		addUnvisited(pending, fetchLinks(start));

		while (!pending.isEmpty()) {
			String next = pending.remove(pending.size() - 1);
			try {
				URL target = new URL(next);
				if (isVisited(target)) {
					continue;
				}
				if (!pauseBetweenFetches()) {
					return;
				}
				// Mark before fetching so a page that fails is not retried forever.
				visited.add(target);
				System.out.println(next);
				addUnvisited(pending, fetchLinks(target));
			} catch (IOException e) {
				System.err.println("Skipping " + next + ": " + e);
			} catch (BadLocationException e) {
				System.err.println("Skipping " + next + ": " + e);
			}
		}
	}

	/**
	 * Turns an {@code href} value into an absolute, fragment-free http(s) URL.
	 *
	 * @param base absolute URL of the page the link was found on
	 * @param href raw attribute value; may be relative, absolute, or {@code null}
	 * @return the resolved URL as a string, or {@code null} when either argument
	 *         is {@code null}, {@code href} is blank, the result cannot be parsed,
	 *         or its scheme is not http/https (e.g. {@code mailto:}, {@code javascript:})
	 */
	static String resolveLink(String base, String href) {
		if (base == null || href == null) {
			return null;
		}
		String spec = href.trim();
		if (spec.length() == 0) {
			return null;
		}
		try {
			URL resolved = new URL(new URL(base), spec);
			String protocol = resolved.getProtocol();
			if (!"http".equalsIgnoreCase(protocol) && !"https".equalsIgnoreCase(protocol)) {
				return null;
			}
			String absolute = resolved.toExternalForm();
			// A fragment addresses a position inside the same page, not another page.
			int hash = absolute.indexOf('#');
			return hash >= 0 ? absolute.substring(0, hash) : absolute;
		} catch (MalformedURLException e) {
			return null;
		}
	}

	/**
	 * Downloads one page and returns the absolute http(s) links it contains.
	 * Every returned link is also appended to {@link #urls} and counted in
	 * {@link #linkamount}. The reader and connection are always released.
	 */
	private ArrayList<String> fetchLinks(URL page) throws IOException, BadLocationException {
		String protocol = page.getProtocol();
		if (!"http".equalsIgnoreCase(protocol) && !"https".equalsIgnoreCase(protocol)) {
			// Guards the HttpURLConnection cast below.
			throw new MalformedURLException("Only http(s) URLs can be crawled: " + page);
		}

		HttpURLConnection con = (HttpURLConnection) page.openConnection();
		con.setRequestMethod("GET");
		// NOTE(review): Content-Language describes a request body; Accept-Language
		// is probably what was intended here - confirm before changing.
		con.setRequestProperty("Content-Language", "en-US");

		ArrayList<String> found = new ArrayList<String>();
		BufferedReader rd = null;
		try {
			rd = new BufferedReader(new InputStreamReader(con.getInputStream()));
			// After redirects the connection URL is the page actually served,
			// which is the correct base for relative links.
			String base = con.getURL().toExternalForm();

			EditorKit kit = new HTMLEditorKit();
			HTMLDocument doc = (HTMLDocument) kit.createDefaultDocument();
			// Keep the parser from aborting when a <meta> tag declares a charset.
			doc.putProperty("IgnoreCharsetDirective", Boolean.TRUE);
			kit.read(rd, doc, 0);

			// Walk all <a> tags (hyperlinks).
			HTMLDocument.Iterator it = doc.getIterator(HTML.Tag.A);
			while (it.isValid()) {
				MutableAttributeSet mas = (MutableAttributeSet) it.getAttributes();
				Object href = mas == null ? null : mas.getAttribute(HTML.Attribute.HREF);
				// Anchors without an href (e.g. <a name="...">) yield null here.
				String link = href instanceof String ? resolveLink(base, (String) href) : null;
				if (link != null) {
					urls.add(link);
					found.add(link);
					System.out.println(linkamount++);
				}
				it.next();
			}
		} finally {
			if (rd != null) {
				try {
					rd.close();
				} catch (IOException ignored) {
					// Nothing useful can be done if closing fails.
				}
			}
			con.disconnect();
		}
		return found;
	}

	/** Appends to {@code pending} every candidate that has not been visited yet. */
	private void addUnvisited(ArrayList<String> pending, ArrayList<String> candidates) {
		for (int i = 0; i < candidates.size(); i++) {
			String candidate = candidates.get(i);
			try {
				if (!isVisited(new URL(candidate))) {
					pending.add(candidate);
				}
			} catch (MalformedURLException ignored) {
				// resolveLink only returns parseable URLs; skip anything else.
			}
		}
	}

	/**
	 * Reports whether {@code page} is already in {@link #visited}. Compares the
	 * textual form instead of calling {@code URL.equals}, which may perform
	 * blocking DNS lookups.
	 */
	private boolean isVisited(URL page) {
		String wanted = page.toExternalForm();
		for (int i = 0; i < visited.size(); i++) {
			if (wanted.equals(visited.get(i).toExternalForm())) {
				return true;
			}
		}
		return false;
	}

	/**
	 * Sleeps for {@link #FETCH_DELAY_MS}.
	 *
	 * @return {@code false} if the thread was interrupted (flag restored), else {@code true}
	 */
	private boolean pauseBetweenFetches() {
		try {
			Thread.sleep(FETCH_DELAY_MS);
			return true;
		} catch (InterruptedException e) {
			Thread.currentThread().interrupt();
			return false;
		}
	}
}