`
nkliuliu
  • 浏览: 207443 次
  • 性别: Icon_minigender_1
  • 来自: 北京
社区版块
存档分类
最新评论

抓取并验证代理ip小demo

阅读更多

      年前在考虑搞个关键词扩词工具,如果直接抓取google的相关搜索,用不了几次就被封ip了,设了抓取间隔时长也没用(也可能是设的时长太短),没办法只能抓取代理ip了,便有了下面的小demo。可惜的是网络资源还是太少,不够抓的,真正能用的代理ip没几个,根本形不成可用的规模。代码如下,给可能用到的人做个参考吧:

package com.emar.spider;

import java.util.HashMap;
import java.util.Map;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import com.emar.core.httpClient.HttpclientUtil;
import com.emar.core.util.PropertiesUtil;

/**
 * Scrapes a public proxy listing page, checks each advertised proxy and
 * keeps the working ones in a properties file (key = host, value = port).
 *
 * <p>The class is a small command-line utility: {@link #main(String[])} runs an
 * endless re-validate / re-scrape cycle with a fixed pause between cycles.
 */
public class Proxy {
	/** Listing page that is scraped for candidate proxies. */
	private static final String url = "http://www.proxycn.com/html_proxy/30fastproxy-1.html";
	/**
	 * Candidates collected so far (host -> port). This map is shared and is
	 * never cleared, so entries accumulate across calls to {@link #getProxyMap()}.
	 */
	private static Map<String, String> ipMap = new HashMap<String, String>();
	/** Properties file that stores the proxies that passed the check. */
	private static final String filePath = "D:/sts/workspace-sts-2.5.1_t1/sf3a/src/main/resources/proxy.properties";

	/** Upper bound on download attempts per scrape, so a dead site cannot hang the cycle. */
	private static final int MAX_FETCH_ATTEMPTS = 5;
	/** Pause between two failed download attempts, in milliseconds. */
	private static final long FETCH_RETRY_DELAY_MS = 5000L;
	/** Pause between two validation cycles in {@link #main(String[])}: 30 minutes. */
	private static final long CYCLE_INTERVAL_MS = 1000L * 60 * 30;

	/**
	 * Downloads the listing page and adds every proxy found on it to the shared
	 * candidate map.
	 *
	 * <p>Rows whose {@code onDblClick} attribute does not yield a
	 * {@code host:port} pair are skipped. If the page cannot be downloaded
	 * after {@value #MAX_FETCH_ATTEMPTS} attempts, no new entries are added.
	 *
	 * @return the shared candidate map (host -> port); the same instance is
	 *         returned on every call
	 */
	public static Map<String, String> getProxyMap() {
		Document doc = Jsoup.parse(fetchListingHtml());
		for (Element row : doc.select("tr[onDblClick]")) {
			String[] hostPort = parseHostPort(row.attr("onDblClick"));
			if (hostPort != null) {
				ipMap.put(hostPort[0], hostPort[1]);
			}
		}
		return ipMap;
	}

	/**
	 * Downloads the listing page, retrying a bounded number of times with a
	 * pause between attempts.
	 *
	 * @return the page HTML, or an empty string when every attempt failed or
	 *         the thread was interrupted while waiting to retry
	 */
	private static String fetchListingHtml() {
		for (int attempt = 1; attempt <= MAX_FETCH_ATTEMPTS; attempt++) {
			try {
				String html = HttpclientUtil.get(url, null, "GB2312");
				// Jsoup.parse rejects null, so normalise to an empty document.
				return html == null ? "" : html;
			} catch (Exception e) {
				e.printStackTrace();
			}
			if (attempt < MAX_FETCH_ATTEMPTS) {
				try {
					// Back off instead of hammering the site in a tight loop.
					Thread.sleep(FETCH_RETRY_DELAY_MS);
				} catch (InterruptedException ie) {
					// Keep the interrupt visible to the caller and stop retrying.
					Thread.currentThread().interrupt();
					break;
				}
			}
		}
		return "";
	}

	/**
	 * Extracts host and port from the value of a row's {@code onDblClick}
	 * attribute by stripping the surrounding script text.
	 *
	 * @param onDblClick raw attribute value
	 * @return a two-element array {@code {host, port}}, or {@code null} when
	 *         the value does not contain a non-empty {@code host:port} pair
	 */
	private static String[] parseHostPort(String onDblClick) {
		// All of these are literal fragments, so plain replace() is used
		// rather than the regex-based replaceAll().
		String cleaned = onDblClick.replace("clip", "")
				.replace("已拷贝到剪贴板!", "").replace("alert", "")
				.replace("'", "").replace(";", "")
				.replace("(", "").replace(")", "");
		String[] parts = cleaned.split(":");
		if (parts.length < 2) {
			return null;
		}
		String host = parts[0].trim();
		String port = parts[1].trim();
		if (host.length() == 0 || port.length() == 0) {
			return null;
		}
		return new String[] { host, port };
	}

	/**
	 * Scrapes fresh candidates, checks each one and updates the properties
	 * file: working proxies are written, failing ones are removed.
	 */
	public static void writeValidProxy() {
		Map<String, String> candidates = getProxyMap();
		System.out.println("本次共获取到的:" + candidates.size() + "个代理");
		for (Map.Entry<String, String> entry : candidates.entrySet()) {
			String ip = entry.getKey();
			String port = entry.getValue();
			System.out.println("获取新的待检验的:" + ip + "=" + port);
			if (HttpclientUtil.checkProxy(ip, port)) {
				System.out.println("写入有效:" + ip + "=" + port);
				PropertiesUtil.writeProperties(filePath, ip, port);
			} else {
				System.out.println("移除失效:" + ip + "=" + port);
				PropertiesUtil.removeProperties(filePath, ip);
			}
		}
	}

	/**
	 * Reads the proxies currently stored in the properties file.
	 *
	 * @return whatever {@code PropertiesUtil.readProperties} returns for the
	 *         proxy file (host -> port)
	 */
	public static Map<String, String> getValidProxyMap() {
		return PropertiesUtil.readProperties(filePath);
	}

	/**
	 * Re-checks every proxy stored in the properties file and removes the
	 * ones that no longer pass the check.
	 */
	public static void removeInvalidProxy() {
		Map<String, String> stored = getValidProxyMap();
		for (Map.Entry<String, String> entry : stored.entrySet()) {
			String ip = entry.getKey();
			String port = entry.getValue();
			System.out.println("校验原有:" + ip + "=" + port);
			if (!HttpclientUtil.checkProxy(ip, port)) {
				System.out.println("移除失效:" + ip + "=" + port);
				PropertiesUtil.removeProperties(filePath, ip);
			}
		}
	}

	/**
	 * Runs the validate / scrape cycle every 30 minutes until the thread is
	 * interrupted.
	 *
	 * @param args unused
	 */
	public static void main(String[] args) {
		while (!Thread.currentThread().isInterrupted()) {
			try {
				removeInvalidProxy();
				writeValidProxy();
			} catch (Exception e) {
				e.printStackTrace();
			}
			// Sleep outside the work try-block so that a failed cycle still
			// waits before the next one instead of re-running immediately.
			try {
				Thread.sleep(CYCLE_INTERVAL_MS);
			} catch (InterruptedException e) {
				Thread.currentThread().interrupt();
				return;
			}
		}
	}
}
 

 

 

1
5
分享到:
评论
2 楼 nkliuliu 2011-03-17  
嗯,异曲同工吧。代码写得确实不简洁。
1 楼 li2005 2011-03-17  
楼主,我也玩过网页抓取代理IP,用正则表达式会很方便

相关推荐

Global site tag (gtag.js) - Google Analytics