Java爬取并自定义解析数据

Jeffrey, M 发布于 2022-08-30 308 次阅读


一、引入jar包

<!-- webmagic-core, version 0.7.5 -->
<!-- NOTE(review): the Java snippets in this article use Jsoup classes (Document, Element, Elements); presumably Jsoup arrives transitively through WebMagic - confirm, or declare it explicitly. -->
<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-core</artifactId>
    <version>0.7.5</version>
</dependency>
<!-- webmagic-extension: WebMagic extension module, kept at the same version as webmagic-core -->
<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-extension</artifactId>
    <version>0.7.5</version>
</dependency>

<!-- selenium-java: provides WebDriver, ChromeDriver and ChromeOptions used to drive Chrome in the snippets of this article -->
<dependency>
    <groupId>org.seleniumhq.selenium</groupId>
    <artifactId>selenium-java</artifactId>
    <version>3.141.59</version>
</dependency>

二、下载 chromedriver.exe

Windows 10 版本下载(chromedriver 的版本需与本机安装的 Chrome 浏览器版本一致)

linux版本下载

三、配置 Selenium,模拟人工操作浏览器打开网页

其中 path 为 chromedriver 可执行文件的本地路径,searchUrlPath 为要爬取的页面地址:
// Fetches searchUrlPath in headless Chrome and leaves the rendered HTML in `document`.
// `path` is the location of the chromedriver executable.
System.setProperty("webdriver.chrome.driver", path);
ChromeOptions chromeOptions = new ChromeOptions();
// Headless mode: run Chrome without a visible window.
chromeOptions.addArguments("--headless");
// Disable the Chrome sandbox.
chromeOptions.addArguments("--no-sandbox");
// Disable GPU hardware acceleration.
chromeOptions.addArguments("--disable-gpu");
chromeOptions.addArguments("lang=zh_CN.UTF-8");
chromeOptions.setCapability("acceptSslCerts", true);
// Screenshot support.
chromeOptions.setCapability("takesScreenshot", true);
// CSS selector support.
chromeOptions.setCapability("cssSelectorsEnabled", true);
// Start the Selenium-driven Chrome session.
WebDriver driver = new ChromeDriver(chromeOptions);
Document document;
try {
    driver.manage().timeouts().implicitlyWait(10, TimeUnit.SECONDS);
    // Open the target page.
    driver.get(searchUrlPath);
    // Fixed pause before taking the snapshot - presumably to let the page's scripts finish rendering.
    Thread.sleep(1000);
    document = Jsoup.parse(driver.getPageSource());
} finally {
    // Always close the browser, even when loading fails, so that no
    // Chrome/chromedriver process is left running in the background.
    driver.quit();
}

四、按照网页标签解析出对应数据

// Locate the product list container. getElementById returns null when the id is
// absent (for example when a login or verification page was served instead), so
// fall back to an empty result rather than failing with a NullPointerException.
Element elementById = document.getElementById("J_main");
Element goodsList = elementById == null ? null : elementById.getElementById("J_goodsList");
Elements elements = goodsList == null ? new Elements() : goodsList.getElementsByClass("gl-item");
// Print the fields of every product entry.
for (Element el : elements) {
    String price = el.getElementsByClass("p-price").eq(0).text();
    String title = el.getElementsByClass("p-name").eq(0).text();
    String shop = el.getElementsByClass("p-shop").eq(0).text();
    // The first <a> of the entry supplies both the detail link and the product image.
    Elements a = el.getElementsByTag("a");
    String href = a.eq(0).attr("href");
    // The <img> element is serialized to HTML and handed to the project helpers,
    // which extract the image URL and rewrite its size.
    String imageStr = a.eq(0).select("img").eq(0).toString();
    String imageUrl = ClimbHtmlMethodUtils.extractImageUrlTwo(imageStr);
    String imageReplaceUrl = ClimbHtmlMethodUtils.replaceImageSize(imageUrl);
    System.out.println("=========================");
    System.out.println("标题:" + title);
    System.out.println("图片:" + imageReplaceUrl);
    System.out.println("店铺:" + shop);
    System.out.println("价格:" + price);
    System.out.println("详情地址:" + href);
}

示例代码

/**
 * Demo entry point: runs the crawler against a JD search page for the keyword "apple".
 *
 * @param args command-line arguments (unused)
 * @throws Exception propagated from {@code parseJDDetail}
 */
public static void main(String[] args) throws Exception {
    parseJDDetail("https://search.jd.com/Search?keyword=apple&enc=utf-8");
}

    /**
     * Loads a JD search-result page in headless Chrome and prints the data found in it.
     *
     * <p>First prints the {@code data-url} and (size-adjusted) {@code data-lazy-img}
     * attributes of the first image inside every {@code ps-wrap} element, then prints
     * title, image, shop, price and detail link of every {@code gl-item} element.
     *
     * @param url address of the search-result page to crawl
     * @throws IllegalStateException if the page does not contain the
     *         {@code J_main} / {@code J_goodsList} containers
     * @throws Exception if the browser cannot be started or the page cannot be loaded
     */
    public static void parseJDDetail(String url) throws Exception {
        Document document = loadRenderedPage(url);

        // getElementById returns null when the id is missing (for example when a login
        // or verification page was served), so fail with a clear message instead of an NPE.
        Element mainContainer = document.getElementById("J_main");
        Element goodsList = mainContainer == null ? null : mainContainer.getElementById("J_goodsList");
        if (goodsList == null) {
            throw new IllegalStateException(
                    "Product list (#J_main / #J_goodsList) not found in page: " + url);
        }

        // Thumbnail strips: print the raw data-url and the resized lazy-load image URL.
        for (Element el : goodsList.getElementsByClass("ps-wrap")) {
            Elements images = el.getElementsByTag("img");
            String attr = images.eq(0).attr("data-url");
            System.out.println("标题11:" + attr);
            String src = images.eq(0).attr("data-lazy-img");
            String imageReplaceUrl = ClimbHtmlMethodUtils.replaceImageSize(src);
            // Keep the original URL when the resize helper yields nothing usable.
            src = StringUtils.checkNull(imageReplaceUrl) ? src : imageReplaceUrl;
            System.out.println("标题222:" + src);
        }

        // Product entries: print the main fields of each one.
        for (Element el : goodsList.getElementsByClass("gl-item")) {
            String price = el.getElementsByClass("p-price").eq(0).text();
            String title = el.getElementsByClass("p-name").eq(0).text();
            String shop = el.getElementsByClass("p-shop").eq(0).text();
            // The first <a> of the entry supplies both the detail link and the product image.
            Elements a = el.getElementsByTag("a");
            String href = a.eq(0).attr("href");
            String imageStr = a.eq(0).select("img").eq(0).toString();
            String imageUrl = ClimbHtmlMethodUtils.extractImageUrlTwo(imageStr);
            String imageReplaceUrl = ClimbHtmlMethodUtils.replaceImageSize(imageUrl);
            System.out.println("=========================");
            System.out.println("标题:" + title);
            System.out.println("图片:" + imageReplaceUrl);
            System.out.println("店铺:" + shop);
            System.out.println("价格:" + price);
            System.out.println("详情地址:" + href);
        }
    }

    /**
     * Opens {@code url} in a headless Chrome session and returns the rendered page
     * parsed by Jsoup. The browser is always shut down before this method returns.
     */
    private static Document loadRenderedPage(String url) throws InterruptedException {
        System.setProperty("webdriver.chrome.driver", "D:\\\\file\\\\new\\\\chromedriver.exe");
        ChromeOptions chromeOptions = new ChromeOptions();
        // Headless mode: run Chrome without a visible window.
        chromeOptions.addArguments("--headless");
        // Disable the Chrome sandbox.
        chromeOptions.addArguments("--no-sandbox");
        // Disable GPU hardware acceleration.
        chromeOptions.addArguments("--disable-gpu");
        chromeOptions.addArguments("lang=zh_CN.UTF-8");
        chromeOptions.setCapability("acceptSslCerts", true);
        // Screenshot support.
        chromeOptions.setCapability("takesScreenshot", true);
        // CSS selector support.
        chromeOptions.setCapability("cssSelectorsEnabled", true);
        // Start the Selenium-driven Chrome session.
        WebDriver driver = new ChromeDriver(chromeOptions);
        try {
            driver.manage().timeouts().implicitlyWait(10, TimeUnit.SECONDS);
            // Open the target page.
            driver.get(url);
            // Fixed pause before taking the snapshot - presumably to let the page's scripts finish rendering.
            Thread.sleep(1000);
            return Jsoup.parse(driver.getPageSource());
        } finally {
            // Always close the browser, even when loading fails, so that no
            // Chrome/chromedriver process is left running in the background.
            driver.quit();
        }
    }


微信扫描下方的二维码阅读本文

此作者没有提供个人介绍
最后更新于 2022-08-30