反复给网站编写不同的爬虫逻辑太麻烦了,自己实现了一个小框架
可以自定义的部分有:
请求方式(默认为Get请求,user-agent为谷歌浏览器的设置),可以通过实现RequestSet接口来自定义请求方式
储存方式(默认储存在f盘的html文件夹下),可以通过实现SaveUtil接口来自定义保存方式
需要保存的资源(默认为整个html页面)
筛选方式(默认所有url都符合要求),通过实现ResourseChooser接口来自定义需要保存的url和资源页面
实现的部分有:
html页面的下载方式,通过HttpClient实现html页面的下载
html页面的解析部分,通过jsoup实现html页面的解析
HtmlDownloader类,用于根据一个url下载一个html页面
package DownloadPackage; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStreamReader; import org.apache.http.HttpEntity; import org.apache.http.HttpResponse; import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.impl.client.HttpClients; /* * 根据一个url下载一个html页面 */ public class HtmlDownloader { RequestSet requestset = null; public HtmlDownloader(RequestSet requestset){ this.requestset = requestset; } public String downloadhtml(String url){ String html = null; //创建一个客户端 //创建一个读取流从entity读取html BufferedReader reader = null; CloseableHttpClient httpclient = HttpClients.createDefault(); HttpResponse response = null; try { response = httpclient.execute(requestset.getMethod(url)); HttpEntity entity = response.getEntity(); reader = new BufferedReader(new InputStreamReader(entity.getContent())); StringBuilder sb = new StringBuilder(); while((html = reader.readLine()) != null){ sb.append(html); } html = sb.toString(); System.out.println("一个html页面获取成功"); } catch (IOException e) { System.out.println(url+"连接失败"); } finally{ if(reader != null){ try { reader.close(); httpclient.close(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } } } return html; } }
UrlGet类,用于根据一个html页面获得所有的url链接
package DownloadPackage;

import java.util.LinkedList;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Extracts link targets from an HTML page.
 */
public class UrlGet {

    /**
     * Parses {@code html} with jsoup and collects the {@code href} attribute
     * value of every {@code <a>} element, in the order jsoup returns them.
     *
     * <p>The values are returned exactly as written in the page; nothing is
     * resolved, filtered or de-duplicated here.
     *
     * @param html page source to scan
     * @return one entry per {@code <a>} element found
     */
    public LinkedList<String> geturls(String html) {
        final LinkedList<String> hrefs = new LinkedList<String>();
        final Document page = Jsoup.parse(html);
        final Elements anchors = page.getElementsByTag("a");
        for (int i = 0; i < anchors.size(); i++) {
            final Element anchor = anchors.get(i);
            hrefs.add(anchor.attr("href"));
        }
        return hrefs;
    }
}
资源选择接口,需要实现三个方法,第一是isNeed方法,判断url是否为需要的,第二个是isResourse方法,判断url页面是不是需要的资源页面,第三个是process方法,
有时网页上的url是我们需要的但是格式不对,对url进行加工
package ChoosePackage;

/**
 * Strategy that decides which urls the crawler follows and which pages it keeps.
 */
public interface ResourseChooser {

    /**
     * Tells whether a url is wanted at all.
     *
     * @param url the url to test
     * @return whether the url is needed
     */
    Boolean isNeed(String url);

    /**
     * Tells whether the page behind a url is a resource page that should be kept.
     *
     * @param url the url to test
     * @return whether the url points at a wanted resource page
     */
    Boolean isResourse(String url);

    /**
     * Reworks a url that is wanted but not in the right format.
     *
     * @param url the url as found on a page
     * @return the processed url
     */
    String process(String url);
}
RequestSet接口,用于自定义请求方法,实现getMethod方法获取请求方法
package DownloadPackage;

import org.apache.http.client.methods.HttpGet;

/*
 * Interface used to obtain the request for a url.
 * Implement getMethod to supply the GET method that should be used.
 */
public interface RequestSet {
    /* Returns the HttpGet to use for the given url. */
    public HttpGet getMethod(String url);
}
Saveutil接口用于自定义保存方式,需要实现save方法
package SaveUtil;

/*
 * Storage utility interface; implementations must provide the save method.
 */
public interface SaveUtil {
    /* Stores html, the content obtained for url. */
    public void save(String url,String html);
}
Spider类,有五种构造方法,可以实现多种自定义操作,其中实现了上述自定义接口的默认实现类
package Spider; import java.io.BufferedWriter; import java.io.File; import java.io.FileWriter; import java.io.IOException; import java.util.HashSet; import java.util.Iterator; import java.util.LinkedList; import org.apache.http.client.config.RequestConfig; import org.apache.http.client.methods.HttpGet; import ChoosePackage.MyResourseChooser; import ChoosePackage.ResourseChooser; import DownloadPackage.HtmlDownloader; import DownloadPackage.RequestSet; import DownloadPackage.UrlGet; import SaveUtil.MySaveUtil; import SaveUtil.SaveUtil; /* * 用于爬取资源的类 */ public class Spider{ public static void main(String[] args) { new Spider("http://www.bilibili.net").spiderstart(); } //种子url String seed = null; //用于保存数据的类,需要自己实现 private SaveUtil saveutil = null; //html下载类 private HtmlDownloader downloader = null; //url下载类 private UrlGet urldownloader = null; //资源选择工具 private ResourseChooser resoursechooser = null; //用于保存未下载的网页 LinkedList<String> unvisited = new LinkedList<String>(); //用于保存已下载的网页 HashSet<String> visited = new HashSet<String>(); //自定义储存方式,请求方式,资源筛选方式的构造方法 public Spider(SaveUtil saveutil,RequestSet request,ResourseChooser resoursechooser,String seed){ this.saveutil = saveutil; this.downloader = new HtmlDownloader(request); this.urldownloader = new UrlGet(); this.resoursechooser = resoursechooser; this.seed = seed; unvisited.add(seed); } //自定义储存方式,资源筛选方式的构造方法 public Spider(SaveUtil saveutil,ResourseChooser resoursechooser,String seed){ this.resoursechooser = resoursechooser; this.downloader = new HtmlDownloader(new getRequest()); this.saveutil = saveutil; this.urldownloader = new UrlGet(); this.seed = seed; unvisited.add(seed); } //自定义储存方式,请求的构造方法 public Spider(SaveUtil saveutil,RequestSet requestset,String seed){ this.saveutil = saveutil; this.downloader = new HtmlDownloader(requestset); this.resoursechooser = new MyResourseChooser(); this.urldownloader = new UrlGet(); this.seed = seed; unvisited.add(seed); } //自定义储存方式的构造方法 public Spider(SaveUtil saveutil,String seed){ 
this.saveutil = saveutil; this.downloader = new HtmlDownloader(new getRequest()); this.resoursechooser = (new MyResourseChooser()); this.urldownloader = new UrlGet(); this.seed = seed; unvisited.add(seed); } //默认的爬虫构造方法 public Spider(String seed){ this.saveutil = new MySaveUtil(); this.downloader = new HtmlDownloader(new getRequest()); this.resoursechooser = (new MyResourseChooser()); this.urldownloader = new UrlGet(); this.seed = seed; unvisited.add(seed); } //开始爬取的方法 private void spiderstart(){ String html = null; while(!unvisited.isEmpty()){ String url = unvisited.poll(); System.out.println("开始获取"+url); if(resoursechooser.isNeed(url)){ try{ html = downloader.downloadhtml(url); } catch(RuntimeException e){ System.out.println(url+"连接获取失败"); continue; } visited.add(url); LinkedList<String> urls = new LinkedList<String>(); try{ urls = urldownloader.geturls(html); } catch(RuntimeException e){ System.out.println(url+"的html页面为空"); continue; } Iterator<String> it = urls.iterator(); while(it.hasNext()){ String newurl = it.next(); if(resoursechooser.isNeed(newurl)&&!visited.contains(newurl)&&!unvisited.contains(newurl)){ newurl = resoursechooser.process(newurl); unvisited.add(newurl); System.out.println(newurl+"加入页面"); } } System.out.println("获取了"+url+"上的所有url"); if(resoursechooser.isResourse(url)){ saveutil.save(url,html); } } } } //默认资源筛选类 private class MyResourseChooser implements ResourseChooser{ @Override public Boolean isNeed(String url) { // TODO Auto-generated method stub if(!url.startsWith("/")&&!url.startsWith("http")){ return false; } return true; } @Override public Boolean isResourse(String url) { // TODO Auto-generated method stub return true; } @Override public String process(String url) { // TODO Auto-generated method stub if(!url.startsWith("http")){ url = seed+url; } return url; } } public class getRequest implements RequestSet{ public HttpGet getMethod(String url) { // TODO Auto-generated method stub //创建一个get请求方法 HttpGet getmethod = new HttpGet(url); 
//HttpHost proxy = new HttpHost("124.88.67.81",80);这里不设置代理IP //设置请求超时时间等 RequestConfig responseconfig = RequestConfig.custom().setConnectionRequestTimeout(10000).setConnectTimeout(10000).setSocketTimeout(10000).build(); //设置请求头,主要是user-agent getmethod.addHeader("User-Agent","Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36"); //设置请求参数 getmethod.setConfig(responseconfig); return getmethod; } } //默认的存储类 public class MySaveUtil implements SaveUtil{ @Override public void save(String url, String html) { // TODO Auto-generated method stub String filename = getfilename(url); BufferedWriter writer = null; try{ writer = new BufferedWriter(new FileWriter(filename)); writer.write(html); writer.flush(); System.out.println("文件写入成功"); } catch(IOException e){ System.out.println("文件写入失败"); } finally{ try { if(writer != null) writer.close(); } catch (IOException e) { // TODO Auto-generated catch block System.out.println("流关闭失败"); } } } private String getfilename(String url){ String fileparentpath = "f://html"; File file = new File(fileparentpath); if(!file.exists()){ file.mkdir(); } int last = url.lastIndexOf("."); int first = url.indexOf("."); url = url.substring(first,last); url = url.replaceAll("\\.", ""); url = url.replaceAll("/", ""); return fileparentpath+"/"+url+".txt"; } } }
总结
以上就是本文关于分享一个简单的java爬虫框架的全部内容,希望对大家有所帮助。感兴趣的朋友可以继续参阅本站:Python爬虫实例爬取网站搞笑段子、Java线程之锁对象Lock-同步问题更完美的处理方式代码实例、Java编程几个循环实例代码分享等,有什么问题可以随时留言,小编会及时回复大家的。感谢朋友们对本站的支持!
声明:本文内容来源于网络,版权归原作者所有,内容由互联网用户自发贡献自行上传,本网站不拥有所有权,未作人工编辑处理,也不承担相关法律责任。如果您发现有涉嫌版权的内容,欢迎发送邮件至:notice#niaoge.com(发邮件时,请将#更换为@)进行举报,并提供相关证据,一经查实,本站将立刻删除涉嫌侵权内容。