当前位置:AIGC资讯 > 数据采集 > 正文

使用 HttpClient、HtmlCleaner、XPath 采集新浪微博 3G 站点数据

package cn.mingyuan.weibo.commons;      import org.apache.http.client.HttpClient;   import org.apache.http.client.methods.HttpRequestBase;   import org.apache.http.impl.client.DefaultHttpClient;      /**   * http请求基类   *    * @author mingyuan   *    */   public abstract class RequestCommons {       protected HttpClient httpclient = null;          public RequestCommons() {           initHttpClient();       }          /**       * 初始化httpclient       */       protected void initHttpClient() {           httpclient = new DefaultHttpClient();       }          protected HttpClient getHttpClient() {           return httpclient;       }          protected void addHeader(HttpRequestBase request, String key, String value) {           request.addHeader(key, value);       }          protected void addCookie(HttpRequestBase request, String cookie) {           addHeader(request, "Cookie", cookie);       }          protected void setCookie(HttpRequestBase request, String cookie) {           request.setHeader("Cookie", cookie);       }          /**       * 设置请求的header值       *        * @param request       *            http的get或者post请求       */       protected void setHeader(HttpRequestBase request) {           request.setHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");           request.setHeader("Accept-Language", "en-us,en;q=0.5");           request.setHeader("Connection", "keep-alive");           request.setHeader("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:13.0) Gecko/20100101 Firefox/13.0.1");       }   }  

2、weibo.cn 登录

[java] view plain copy

package cn.mingyuan.weibo.login;

import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.TimeUnit;

import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.NameValuePair;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.HttpClient;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode;
import org.htmlcleaner.XPatherException;

import cn.mingyuan.weibo.commons.RequestCommons;
import cn.mingyuan.weibo.until.Constants;

/**
 * Logs in through the 3G login form and collects the resulting cookies.
 *
 * <p>The flow implemented by {@link #doLogin(String, String)} is: fetch the
 * login page and read the form parameters, post the credentials, fetch the
 * page named by the {@code Location} header, then follow the first link found
 * on that page.
 *
 * @author mingyuan
 */
public class Login extends RequestCommons {

    /** Login page that contains the form to submit. */
    private static final String LOGIN_PAGE_URL =
            "http://3g.sina.com.cn/prog/wapsite/sso/login.php?backURL=http%3A%2F%2Fweibo.cn%2F&backTitle=%D0%C2%C0%CB%CE%A2%B2%A9&vt=4&revalid=2&ns=1";

    /** Prefix prepended to the form's {@code action} attribute. */
    private static final String SSO_BASE_URL = "http://3g.sina.com.cn/prog/wapsite/sso/";

    /** Returns the string form of the first XPath result, or {@code null} when there is none. */
    private static String firstAsString(Object[] results) {
        if (results == null || results.length == 0) {
            return null;
        }
        return results[0].toString();
    }

    /** Returns the value of the first header, or {@code null} when the array is null or empty. */
    private static String firstHeaderValue(Header[] headers) {
        if (headers == null || headers.length == 0) {
            return null;
        }
        return headers[0].getValue();
    }

    /**
     * Picks the ';'-separated segment that starts with {@code namePrefix} from
     * any of the given headers; falls back to the whole value of the first
     * header when no segment matches, and to {@code null} when there are no
     * headers at all.
     */
    private static String selectCookie(Header[] headers, String namePrefix) {
        if (headers == null || headers.length == 0) {
            return null;
        }
        for (Header header : headers) {
            String value = header.getValue();
            if (value == null) {
                continue;
            }
            for (String segment : value.split(";")) {
                String candidate = segment.trim();
                if (candidate.startsWith(namePrefix)) {
                    return candidate;
                }
            }
        }
        return headers[0].getValue();
    }

    /**
     * Fetches the login page and extracts the form parameters from it.
     *
     * @return a three-element array: the form's {@code action}, the
     *         {@code name} of the password input, and the {@code value} of the
     *         input named {@code vk}; an element is {@code null} when it could
     *         not be read
     */
    private String[] getLoginParameters() {
        HttpClient httpClient = getHttpClient();
        String location = LOGIN_PAGE_URL;
        HttpGet get = new HttpGet(location);
        setHeader(get);
        String retAction = null;
        String retPassword = null;
        String retVk = null;
        try {
            HttpResponse response = httpClient.execute(get);
            HttpEntity entity = response.getEntity();
            if (entity != null) {
                InputStream content = entity.getContent();

                // Extract the login parameters from the parsed page.
                HtmlCleaner cleaner = new HtmlCleaner();
                TagNode tagNode = cleaner.clean(content, "utf-8");
                retAction = firstAsString(tagNode.evaluateXPath("//form/@action"));
                retPassword = firstAsString(tagNode.evaluateXPath("//form//input[@type='password']/@name"));
                retVk = firstAsString(tagNode.evaluateXPath("//form//input[@name='vk']/@value"));
            }
            EntityUtils.consume(entity);
        } catch (ClientProtocolException e) {
            System.out.println("获取登陆页面失败,location=" + location);
            e.printStackTrace();
        } catch (IOException e) {
            System.out.println("获取页面内容流失败");
            e.printStackTrace();
        } catch (XPatherException e) {
            System.out.println("解析登陆参数失败");
            e.printStackTrace();
        } finally {
            get.releaseConnection();
        }

        System.out.println("请求页面:" + location);
        System.out.println("提交地址:" + retAction);
        System.out.println("密码输入框名称:" + retPassword);
        System.out.println("vk值:" + retVk);

        return new String[] { retAction, retPassword, retVk };
    }

    /**
     * Posts the account name and password to the login form.
     *
     * @param postAction
     *            form action, appended to the SSO base URL
     * @param userNameValue
     *            account name
     * @param passwordValue
     *            account password
     * @param passwordKey
     *            {@code name} of the password input field
     * @param vkValue
     *            value of the {@code vk} field
     * @return a two-element array: the value of the first {@code Set-Cookie}
     *         header and the value of the first {@code Location} header; an
     *         element is {@code null} when the header is absent or the request
     *         failed
     */
    private String[] submitPassword(String postAction, String userNameValue, String passwordValue, String passwordKey,
            String vkValue) {
        HttpClient httpclient = getHttpClient();
        String url = SSO_BASE_URL + postAction;
        System.out.println("开始提交账号密码:" + url);
        HttpPost post = new HttpPost(url);
        setHeader(post);
        List<NameValuePair> nvps = new ArrayList<NameValuePair>();
        nvps.add(new BasicNameValuePair("mobile", userNameValue));
        nvps.add(new BasicNameValuePair(passwordKey, passwordValue));
        nvps.add(new BasicNameValuePair("remember", "on"));
        nvps.add(new BasicNameValuePair("vk", vkValue));
        nvps.add(new BasicNameValuePair("backURL", "http://weibo.cn/"));
        nvps.add(new BasicNameValuePair("backTitle", "新浪微博"));
        nvps.add(new BasicNameValuePair("submit", "登录"));
        String cookie = null;
        String location = null;
        try {
            // NOTE(review): no charset is passed, so the library default applies;
            // the non-ASCII backTitle/submit values may not survive encoding.
            // Confirm which charset the server expects before changing this.
            post.setEntity(new UrlEncodedFormEntity(nvps));
            HttpResponse response = httpclient.execute(post);
            HttpEntity entity2 = response.getEntity();

            // A missing header yields an empty array, so check before indexing.
            cookie = firstHeaderValue(response.getHeaders("Set-Cookie"));
            if (cookie != null) {
                System.out.println("获取到Cookie:" + cookie);
            }
            location = firstHeaderValue(response.getHeaders("Location"));
            if (location != null) {
                System.out.println("获取到跳转链接:" + location);
            }
            EntityUtils.consume(entity2);
        } catch (UnsupportedEncodingException e1) {
            e1.printStackTrace();
        } catch (ClientProtocolException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            post.releaseConnection();
        }

        return new String[] { cookie, location };
    }

    /**
     * Fetches the page the login response redirected to.
     *
     * @param redirectUrl
     *            address of the redirect page
     * @return a two-element array: the cookie taken from the response (the
     *         {@code gsid_CTandWM} segment when present) and the first
     *         {@code //div/a/@href} link on the page; an element is
     *         {@code null} when it could not be read
     */
    private String[] getRedirectPageInfo(String redirectUrl) {
        System.out.println("开始获取跳转链接页面");
        HttpGet get = new HttpGet(redirectUrl);
        setHeader(get);
        String cookie = null;
        String clickHref = null;
        try {
            HttpResponse redirectResponse = getHttpClient().execute(get);
            cookie = selectCookie(redirectResponse.getHeaders("Set-Cookie"), "gsid_CTandWM");

            HttpEntity entity = redirectResponse.getEntity();
            if (entity != null) {
                InputStream content = entity.getContent();
                HtmlCleaner cleaner = new HtmlCleaner();
                TagNode tagNode = cleaner.clean(content, "utf-8");
                clickHref = firstAsString(tagNode.evaluateXPath("//div/a/@href"));
                if (clickHref != null) {
                    System.out.println("获取到跳转链接地址:" + clickHref);
                }
            }
            EntityUtils.consume(entity);
        } catch (ClientProtocolException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } catch (XPatherException e) {
            e.printStackTrace();
        } finally {
            get.releaseConnection();
        }
        return new String[] { cookie, clickHref };
    }

    /**
     * Follows the link found on the redirect page, sending the cookie obtained
     * so far.
     *
     * @param cookie
     *            cookie obtained from the previous request
     * @param redirectUrl
     *            address to request
     * @return the cookie taken from the response (the {@code _WEIBO_UID}
     *         segment when present); the {@code cookie} argument unchanged
     *         when the response carries no {@code Set-Cookie} header or the
     *         request fails
     */
    private String doRedirection(String cookie, String redirectUrl) {
        HttpGet get = new HttpGet(redirectUrl);
        setHeader(get);
        get.setHeader("Cookie", cookie);
        try {
            HttpResponse response = getHttpClient().execute(get);
            HttpEntity entity = response.getEntity();
            Header[] headers2 = response.getHeaders("Set-Cookie");
            if (headers2 != null && headers2.length > 0) {
                System.out.println("跳转页面取回的cookie:" + headers2[0].getValue());
                cookie = selectCookie(headers2, "_WEIBO_UID");
            }
            EntityUtils.consume(entity);
        } catch (ClientProtocolException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Always hand the connection back, as the other request methods do.
            get.releaseConnection();
        }
        return cookie;
    }

    /**
     * Logs in with the given account and password.
     *
     * @param userNameValue
     *            account name
     * @param passwordValue
     *            account password
     * @return the cookie from the redirect page and the cookie from the final
     *         jump, joined with {@code ';'}
     * @throws IllegalStateException
     *             if a step of the login flow does not yield the values the
     *             next step needs
     */
    public String doLogin(String userNameValue, String passwordValue) {
        // Read the form parameters from the login page.
        String[] loginParameters = getLoginParameters();
        String postAction = loginParameters[0];
        String passwordKey = loginParameters[1];
        String vkValue = loginParameters[2];
        if (postAction == null || passwordKey == null) {
            throw new IllegalStateException("Login form parameters could not be read: action=" + postAction
                    + ", passwordKey=" + passwordKey);
        }

        // Submit the credentials to obtain the redirect link.
        String[] cookieRedirectLocation = submitPassword(postAction, userNameValue, passwordValue, passwordKey, vkValue);
        String redirectUrl = cookieRedirectLocation[1];
        if (redirectUrl == null) {
            throw new IllegalStateException("Login response carried no Location header");
        }

        // Fetch the redirect page; its cookie replaces the one from the form post.
        String[] redirectInfo = getRedirectPageInfo(redirectUrl);
        String cookie = redirectInfo[0];
        redirectUrl = redirectInfo[1];
        if (cookie == null || redirectUrl == null) {
            throw new IllegalStateException("Redirect page yielded no cookie or no link: cookie=" + cookie
                    + ", link=" + redirectUrl);
        }

        System.out.println("准备跳转");
        try {
            TimeUnit.SECONDS.sleep(3);
        } catch (InterruptedException e) {
            e.printStackTrace();
            // Restore the interrupt flag so callers can still observe it.
            Thread.currentThread().interrupt();
        }
        System.out.println("开始跳转");
        String cookieOfRedirect = doRedirection(cookie, redirectUrl);

        String finalCookie = cookie + ';' + cookieOfRedirect;
        System.out.println("登陆成功,最终cookie为:" + finalCookie);
        return finalCookie;
    }

    /**
     * Logs in with the account and password configured in {@link Constants}.
     *
     * @return the login cookie
     */
    public String doLogin() {
        return this.doLogin(Constants.LOGIN_USERNAME, Constants.LOGIN_PASSWORD);
    }
}


3、测试

步骤:先登录获取 cookie,再把 cookie 填入请求的 Cookie 头,获取页面内容。

[java] view plain copy

package cn.mingyuan.weibo.test;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;

import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.util.EntityUtils;

import cn.mingyuan.weibo.commons.RequestCommons;
import cn.mingyuan.weibo.login.Login;

/**
 * Manual test: logs in, then requests one page with the login cookie and
 * prints the page content.
 *
 * @author mingyuan
 */
public class WeiboTest extends RequestCommons {

    /**
     * Prints the stream to standard output line by line, decoding it as UTF-8.
     *
     * @param in
     *            stream to print
     */
    private void printContent(InputStream in) {
        try {
            BufferedReader lineSource = new BufferedReader(new InputStreamReader(in, "utf-8"));
            for (String text = lineSource.readLine(); text != null; text = lineSource.readLine()) {
                System.out.println(text);
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        // The stream is deliberately not closed here; it is left for other use.
    }

    /**
     * Requests one fixed page with the given cookie and prints its content.
     *
     * @param finalCookie
     *            cookie to send in the {@code Cookie} header
     */
    private void test(String finalCookie) {
        HttpGet pageRequest = new HttpGet("http://weibo.cn/irlucene");
        setHeader(pageRequest);
        pageRequest.setHeader("Cookie", finalCookie.toString());
        try {
            HttpResponse reply = httpclient.execute(pageRequest);
            HttpEntity body = reply.getEntity();
            printContent(body.getContent());
            EntityUtils.consume(body);
        } catch (ClientProtocolException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            pageRequest.releaseConnection();
        }
    }

    /**
     * Logs in with placeholder credentials, prints the resulting cookie and
     * runs the page test with it.
     *
     * @param args
     *            unused
     */
    public static void main(String[] args) {
        Login session = new Login();
        String cookie = session.doLogin("username", "password");
        System.out.println("final Cookie=" + cookie);

        WeiboTest runner = new WeiboTest();
        runner.test(cookie);
    }

}

转载:http://blog.csdn.net/telnetor/article/details/8582045

https://passport.sina.cn/signup/signup?r=http%3A%2F%2Fmy.sina.cn%2F%3Fpos%3D108%26vt%3D4%26m%3D78fc51068140045a973a3aeab4db2381

更新时间 2023-11-08