一、引入jar包
<!-- webmagic-core: core module of the WebMagic crawler framework -->
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-core</artifactId>
<version>0.7.5</version>
</dependency>
<!-- webmagic-extension: extension module for WebMagic -->
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-extension</artifactId>
<version>0.7.5</version>
</dependency>
<!-- selenium-java: Selenium Java bindings; the WebDriver, ChromeDriver and ChromeOptions types used in the code below are assumed to come from this artifact -->
<dependency>
<groupId>org.seleniumhq.selenium</groupId>
<artifactId>selenium-java</artifactId>
<version>3.141.59</version>
</dependency>
二、下载 chromedriver.exe(版本需与本机安装的 Chrome 浏览器版本一致,否则驱动无法启动浏览器)
三、编写代码,通过 Selenium 模拟用户操作浏览器打开网页
其中 path 为 chromedriver.exe 的本地路径,searchUrlPath 为要抓取的页面地址:
// Tell Selenium where the chromedriver executable lives.
System.setProperty("webdriver.chrome.driver", path);
ChromeOptions chromeOptions = new ChromeOptions();
// Run Chrome without a visible window.
chromeOptions.addArguments("--headless");
// Disable the Chrome sandbox.
chromeOptions.addArguments("--no-sandbox");
// Disable GPU hardware acceleration.
chromeOptions.addArguments("--disable-gpu");
chromeOptions.addArguments("lang=zh_CN.UTF-8");
chromeOptions.setCapability("acceptSslCerts", true);
// Screenshot support.
chromeOptions.setCapability("takesScreenshot", true);
// CSS selector lookup support.
chromeOptions.setCapability("cssSelectorsEnabled", true);
// Start the browser session through the Selenium Chrome driver.
WebDriver driver = new ChromeDriver(chromeOptions);
// Declared outside the try block so the parsing code that follows can use it.
Document document;
try {
    driver.manage().timeouts().implicitlyWait(10, TimeUnit.SECONDS);
    // Open the target page.
    driver.get(searchUrlPath);
    // Fixed pause before reading the page source; presumably this gives the
    // page's scripts time to render the result list.
    Thread.sleep(1000);
    document = Jsoup.parse(driver.getPageSource());
} finally {
    // Always close the browser, even when loading or parsing fails; otherwise
    // the Chrome and chromedriver processes are left running.
    driver.quit();
}
四、按照网页标签解析出对应数据
// Walk from the element with id "J_main" down to the element with id "J_goodsList".
Element mainContainer = document.getElementById("J_main");
Element resultList = mainContainer.getElementById("J_goodsList");
// Visit every element inside the list that carries the "gl-item" class.
Elements items = resultList.getElementsByClass("gl-item");
for (Element item : items) {
    Elements priceNodes = item.getElementsByClass("p-price");
    Elements nameNodes = item.getElementsByClass("p-name");
    Elements shopNodes = item.getElementsByClass("p-shop");
    String price = priceNodes.eq(0).text();
    String title = nameNodes.eq(0).text();
    String shop = shopNodes.eq(0).text();

    // The first <a> tag of the item supplies both the link and the image.
    Elements firstLink = item.getElementsByTag("a").eq(0);
    String href = firstLink.attr("href");
    String imageMarkup = firstLink.select("img").eq(0).toString();
    // Pull the image URL out of the <img> markup, then pass it through the
    // project's size-replacement helper.
    String imageReplaceUrl = ClimbHtmlMethodUtils.replaceImageSize(
            ClimbHtmlMethodUtils.extractImageUrlTwo(imageMarkup));

    System.out.println("=========================");
    System.out.println("标题:" + title);
    System.out.println("图片:" + imageReplaceUrl);
    System.out.println("店铺:" + shop);
    System.out.println("价格:" + price);
    System.out.println("详情地址:" + href);
}
示例代码
/**
 * Entry point: runs {@code parseJDDetail} against a fixed JD search URL
 * (keyword "apple").
 *
 * @param args command-line arguments (unused)
 * @throws Exception whatever {@code parseJDDetail} propagates
 */
public static void main(String[] args) throws Exception {
    parseJDDetail("https://search.jd.com/Search?keyword=apple&enc=utf-8");
}
/**
 * Loads a JD search-result page in headless Chrome and prints what it finds.
 *
 * <p>Two passes are made over the {@code J_goodsList} element: first every
 * {@code ps-wrap} element (the {@code data-url} and {@code data-lazy-img}
 * attributes of its first image), then every {@code gl-item} element
 * (title, image, shop, price and detail link).
 *
 * <p>The browser is closed as soon as the page source has been captured, and
 * it is closed even when loading fails, so no Chrome process is left behind.
 *
 * @param url address of the search-result page to open
 * @throws IllegalStateException if the rendered page has no {@code J_main}
 *         or no {@code J_goodsList} element
 * @throws Exception anything propagated from the browser session, including
 *         {@code InterruptedException} from the fixed wait
 */
public static void parseJDDetail(String url) throws Exception {
    Document document = fetchRenderedPage(url);

    Element mainContainer = document.getElementById("J_main");
    if (mainContainer == null) {
        throw new IllegalStateException("Element #J_main not found in page: " + url);
    }
    Element goodsList = mainContainer.getElementById("J_goodsList");
    if (goodsList == null) {
        throw new IllegalStateException("Element #J_goodsList not found in page: " + url);
    }

    printThumbnails(goodsList);
    printGoods(goodsList);
}

/** Opens {@code url} in headless Chrome, parses the page source with jsoup, and always quits the driver. */
private static Document fetchRenderedPage(String url) throws Exception {
    System.setProperty("webdriver.chrome.driver", "D:\\\\file\\\\new\\\\chromedriver.exe");
    ChromeOptions chromeOptions = new ChromeOptions();
    // Run Chrome without a visible window.
    chromeOptions.addArguments("--headless");
    // Disable the Chrome sandbox.
    chromeOptions.addArguments("--no-sandbox");
    // Disable GPU hardware acceleration.
    chromeOptions.addArguments("--disable-gpu");
    chromeOptions.addArguments("lang=zh_CN.UTF-8");
    chromeOptions.setCapability("acceptSslCerts", true);
    // Screenshot support.
    chromeOptions.setCapability("takesScreenshot", true);
    // CSS selector lookup support.
    chromeOptions.setCapability("cssSelectorsEnabled", true);

    WebDriver driver = new ChromeDriver(chromeOptions);
    try {
        driver.manage().timeouts().implicitlyWait(10, TimeUnit.SECONDS);
        driver.get(url);
        // Fixed pause before reading the page source; presumably this gives the
        // page's scripts time to render the result list.
        Thread.sleep(1000);
        return Jsoup.parse(driver.getPageSource());
    } finally {
        // Runs on success and on failure, so the browser never outlives this call.
        driver.quit();
    }
}

/** Prints the {@code data-url} and the resized {@code data-lazy-img} of the first image in each {@code ps-wrap} element. */
private static void printThumbnails(Element goodsList) {
    for (Element wrap : goodsList.getElementsByClass("ps-wrap")) {
        Elements firstImage = wrap.getElementsByTag("img").eq(0);
        String dataUrl = firstImage.attr("data-url");
        System.out.println("标题11:" + dataUrl);
        String src = firstImage.attr("data-lazy-img");
        String imageReplaceUrl = ClimbHtmlMethodUtils.replaceImageSize(src);
        // Keep the original URL when checkNull reports the resized one as
        // unusable (presumably null or empty; confirm against StringUtils).
        src = StringUtils.checkNull(imageReplaceUrl) ? src : imageReplaceUrl;
        System.out.println("标题222:" + src);
    }
}

/** Prints title, image, shop, price and detail link for each {@code gl-item} element. */
private static void printGoods(Element goodsList) {
    for (Element item : goodsList.getElementsByClass("gl-item")) {
        String price = item.getElementsByClass("p-price").eq(0).text();
        String title = item.getElementsByClass("p-name").eq(0).text();
        String shop = item.getElementsByClass("p-shop").eq(0).text();
        // The first <a> tag of the item supplies both the link and the image.
        Elements firstLink = item.getElementsByTag("a").eq(0);
        String href = firstLink.attr("href");
        String imageStr = firstLink.select("img").eq(0).toString();
        String imageUrl = ClimbHtmlMethodUtils.extractImageUrlTwo(imageStr);
        String imageReplaceUrl = ClimbHtmlMethodUtils.replaceImageSize(imageUrl);
        System.out.println("=========================");
        System.out.println("标题:" + title);
        System.out.println("图片:" + imageReplaceUrl);
        System.out.println("店铺:" + shop);
        System.out.println("价格:" + price);
        System.out.println("详情地址:" + href);
    }
}
微信扫描下方的二维码阅读本文

Comments NOTHING