GuilinDev

Lc1236

05 August 2008

1236 Web Crawler $

BFS

O((V + E) * L) where L is maximum length of a url

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
/**
 * // This is the HtmlParser's API interface.
 * // You should not implement it, or speculate about its implementation
 * interface HtmlParser {
 *     public List<String> getUrls(String url) {}
 * }
 */
class Solution {
    public List<String> crawl(String startUrl, HtmlParser htmlParser) {
        Set<String> set = new HashSet<>();
        Queue<String> queue = new LinkedList<>();
        String hostname = getHostname(startUrl);
        
        queue.offer(startUrl);
        set.add(startUrl);
        
        while (!queue.isEmpty()) {
            String currentUrl = queue.poll();
            for (String url : htmlParser.getUrls(currentUrl)) {
                if (url.contains(hostname) && !set.contains(url)) {
                    queue.offer(url);
                    set.add(url);
                }
            }
        }
        
        return new ArrayList<String>(set);
    }
    
    private String getHostname(String Url) {
        String[] ss = Url.split("/");
        return ss[2];
    }
}

DFS, crawler一般不用

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
class Solution {
    public List<String> crawl(String startUrl, HtmlParser htmlParser) {
        Set<String> set = new HashSet<>(Arrays.asList(startUrl));
        String hostName = startUrl.split("/")[2];
        dfs(set,htmlParser,hostName,startUrl);
        return new ArrayList<String>(set);
    }
    
    public void dfs(Set<String> visited, HtmlParser hp, String hostName, String currentUrl){
        for (String url : hp.getUrls(currentUrl)) {
            if (url.contains(hostName) && visited.add(url)) {
                dfs(visited,hp,hostName,url);
            }
        }
    }
}