项目目录
Device.java
package com.demo.webmagic.bean;
import java.util.Date;
/**
 * Mutable data holder describing one instrument/device record.
 *
 * <p>Every property has a plain getter and setter with no validation, and
 * {@link #toString()} lists all properties in declaration order.
 */
public class Device {

    // Identity
    private Integer id;
    private String code;
    private String name;
    private String model;

    // Origin
    private String manufacturer;
    private String country;

    // Contact details
    private String contact;
    private String contactNumber;
    private String email;
    private String institute;
    private String location;

    // Free-form descriptive sections (typed as Object, so any value is accepted)
    private Object specification;
    private Object performance;
    private Object application;
    private Object description;

    // Purchase information, kept as text
    private String purchaseDate;
    private String price;
    private Object feeStandard;

    // Picture: remote address and local copy
    private String imageUrl;
    private String imageLocal;

    // Bookkeeping about where and when the record was captured
    private String province;
    private String dataSource;
    private String recorder;
    private Date recordDateTime;

    // Additional attributes
    private String nameEn;
    private String postCode;

    public Integer getId() {
        return id;
    }

    public void setId(Integer id) {
        this.id = id;
    }

    public String getCode() {
        return code;
    }

    public void setCode(String code) {
        this.code = code;
    }

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    public String getModel() {
        return model;
    }

    public void setModel(String model) {
        this.model = model;
    }

    public String getManufacturer() {
        return manufacturer;
    }

    public void setManufacturer(String manufacturer) {
        this.manufacturer = manufacturer;
    }

    public String getCountry() {
        return country;
    }

    public void setCountry(String country) {
        this.country = country;
    }

    public String getContact() {
        return contact;
    }

    public void setContact(String contact) {
        this.contact = contact;
    }

    public String getContactNumber() {
        return contactNumber;
    }

    public void setContactNumber(String contactNumber) {
        this.contactNumber = contactNumber;
    }

    public String getEmail() {
        return email;
    }

    public void setEmail(String email) {
        this.email = email;
    }

    public String getInstitute() {
        return institute;
    }

    public void setInstitute(String institute) {
        this.institute = institute;
    }

    public String getLocation() {
        return location;
    }

    public void setLocation(String location) {
        this.location = location;
    }

    public Object getSpecification() {
        return specification;
    }

    public void setSpecification(Object specification) {
        this.specification = specification;
    }

    public Object getPerformance() {
        return performance;
    }

    public void setPerformance(Object performance) {
        this.performance = performance;
    }

    public Object getApplication() {
        return application;
    }

    public void setApplication(Object application) {
        this.application = application;
    }

    public Object getDescription() {
        return description;
    }

    public void setDescription(Object description) {
        this.description = description;
    }

    public String getPurchaseDate() {
        return purchaseDate;
    }

    public void setPurchaseDate(String purchaseDate) {
        this.purchaseDate = purchaseDate;
    }

    public String getPrice() {
        return price;
    }

    public void setPrice(String price) {
        this.price = price;
    }

    public Object getFeeStandard() {
        return feeStandard;
    }

    public void setFeeStandard(Object feeStandard) {
        this.feeStandard = feeStandard;
    }

    public String getImageUrl() {
        return imageUrl;
    }

    public void setImageUrl(String imageUrl) {
        this.imageUrl = imageUrl;
    }

    public String getImageLocal() {
        return imageLocal;
    }

    public void setImageLocal(String imageLocal) {
        this.imageLocal = imageLocal;
    }

    public String getProvince() {
        return province;
    }

    public void setProvince(String province) {
        this.province = province;
    }

    public String getDataSource() {
        return dataSource;
    }

    public void setDataSource(String dataSource) {
        this.dataSource = dataSource;
    }

    public String getRecorder() {
        return recorder;
    }

    public void setRecorder(String recorder) {
        this.recorder = recorder;
    }

    public Date getRecordDateTime() {
        return recordDateTime;
    }

    public void setRecordDateTime(Date recordDateTime) {
        this.recordDateTime = recordDateTime;
    }

    public String getNameEn() {
        return nameEn;
    }

    public void setNameEn(String nameEn) {
        this.nameEn = nameEn;
    }

    public String getPostCode() {
        return postCode;
    }

    public void setPostCode(String postCode) {
        this.postCode = postCode;
    }

    /**
     * Renders every property as {@code key=value}, comma separated, inside
     * {@code Device [...]}; unset properties appear as {@code null}.
     */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("Device [");
        text.append("id=").append(id);
        text.append(", code=").append(code);
        text.append(", name=").append(name);
        text.append(", model=").append(model);
        text.append(", manufacturer=").append(manufacturer);
        text.append(", country=").append(country);
        text.append(", contact=").append(contact);
        text.append(", contactNumber=").append(contactNumber);
        text.append(", email=").append(email);
        text.append(", institute=").append(institute);
        text.append(", location=").append(location);
        text.append(", specification=").append(specification);
        text.append(", performance=").append(performance);
        text.append(", application=").append(application);
        text.append(", description=").append(description);
        text.append(", purchaseDate=").append(purchaseDate);
        text.append(", price=").append(price);
        text.append(", feeStandard=").append(feeStandard);
        text.append(", imageUrl=").append(imageUrl);
        text.append(", imageLocal=").append(imageLocal);
        text.append(", province=").append(province);
        text.append(", dataSource=").append(dataSource);
        text.append(", recorder=").append(recorder);
        text.append(", recordDateTime=").append(recordDateTime);
        text.append(", nameEn=").append(nameEn);
        text.append(", postCode=").append(postCode);
        return text.append("]").toString();
    }
}
ShanxiProcessor.java
package com.demo.webmagic.processor;
import java.util.Date;
import java.util.List;
import com.demo.webmagic.bean.Device;
import com.demo.webmagic.util.ImageDownloader;
import com.demo.webmagic.util.ImageDownloader.ImgNameType;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Html;
/**
 * WebMagic {@link PageProcessor} for the instrument catalogue on www.tydxyq.cn.
 *
 * <p>List pages ({@code list.asp?page=N}) are used to discover detail-page links and
 * the next list page; every other page is treated as a detail page and turned into a
 * {@link Device}, which is printed to standard output.
 */
public class ShanxiProcessor implements PageProcessor {
    /** Number of the next list page to enqueue; guarded by {@link #nextListPage()} once the spider runs. */
    private static int currentPage = 87;
    /** Highest list page number that will be requested. */
    private static final int LAST_PAGE = 112;
    public static final String SAVE_PATH = "D:/image/shanxi/";
    public static final String DOMAIN = "http://www.tydxyq.cn";
    // The tail is ".*" (any characters). The previous "\\.*" only matched a run of
    // literal dots, and the dot in "list.asp"/"detail.asp" was left unescaped.
    public static final String URL_LIST = "http://www\\.tydxyq\\.cn/yqsb/list\\.asp\\?page=.*";
    public static final String URL_POST = "http://www\\.tydxyq\\.cn/yqsb/detail\\.asp\\?ID=.*";
    public static final String PREFIX_LIST = "http://www.tydxyq.cn/yqsb/list.asp?page=";
    public static final String PREFIX_POST = "http://www.tydxyq.cn/yqsb/detail.asp?ID=";

    /** XPath of the cell that holds all tables read from a detail page. */
    private static final String CONTENT_CELL = "//table[4]//tr/td[2]";
    /** First table inside the content cell (name, model, purchase date, price, application). */
    private static final String BASIC_TABLE = CONTENT_CELL + "/table[1]";
    /** Third table inside the content cell; contains the sub-tables with ids tb1, tb2 and tb4. */
    private static final String DETAIL_TABLE = CONTENT_CELL + "/table[3]";

    private Site site = Site.me()
            .setRetryTimes(3)
            .setTimeOut(10000)
            .setCharset("GBK")
            .setDomain("www.tydxyq.cn")
            .setSleepTime(3000)
            .setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");

    /**
     * Dispatches a downloaded page: list pages only contribute further requests,
     * any other page is parsed into a {@link Device} and printed.
     *
     * @param page the page handed over by the spider
     */
    @Override
    public void process(Page page) {
        if (page.getUrl().regex(URL_LIST).match()) {
            // Always harvest the links, including on the last list page. Whether a
            // further list page is requested is decided inside addTargetRequests.
            addTargetRequests(page);
            return;
        }

        Device device;
        try {
            device = createDevice(page);
        } catch (Exception e) {
            // A page that cannot be parsed yields no record; keep crawling.
            e.printStackTrace();
            return;
        }
        System.out.println(device);
        // Persistence hook (not wired up here): deviceService.add(device);
    }

    /**
     * Queues every detail-page link found in the centred table rows of a list page,
     * then queues the next list page while {@link #LAST_PAGE} has not been passed.
     */
    private void addTargetRequests(Page page) {
        List<String> urlList = page.getHtml().xpath("//tr[@align='center']").links().all();
        for (String urlString : urlList) {
            if (urlString.contains(PREFIX_POST)) {
                page.addTargetRequest(urlString);
            }
        }
        int next = nextListPage();
        if (next > 0) {
            page.addTargetRequest(PREFIX_LIST + next);
        }
    }

    /**
     * Hands out the next list page number and advances the shared counter as one
     * atomic step, because the spider is started with several threads.
     *
     * @return the page number to request, or -1 once {@link #LAST_PAGE} has been handed out
     */
    private static synchronized int nextListPage() {
        if (currentPage > LAST_PAGE) {
            return -1;
        }
        return currentPage++;
    }

    /**
     * Builds a {@link Device} from a detail page. A cell that is missing from the page
     * leaves the corresponding property {@code null} instead of aborting the record.
     */
    private Device createDevice(Page page) {
        Html html = page.getHtml();
        Device device = new Device();
        device.setCode(createCode(page));

        String imageUrl = textOf(html, CONTENT_CELL + "/table[2]//a/@href");
        device.setImageUrl(imageUrl);
        try {
            String imageLocal = ImageDownloader.download(imageUrl, SAVE_PATH, ImgNameType.OBTAIN);
            device.setImageLocal(imageLocal);
        } catch (Exception e) {
            // A failed image download must not discard the textual data.
            e.printStackTrace();
        }

        // The fixed number of leading characters dropped below is presumably a label
        // or separator in front of the value -- NOTE(review): confirm against the live page.
        device.setName(trimmed(html, BASIC_TABLE + "//tr[1]/td//strong/text()"));
        device.setModel(withoutPrefix(html, BASIC_TABLE + "//tr[1]/td/text()", 3));
        device.setPurchaseDate(trimmed(html, BASIC_TABLE + "//tr[9]/td[2]/text()"));
        device.setPrice(trimmed(html, BASIC_TABLE + "//tr[10]/td[2]/text()"));
        device.setApplication(trimmed(html, BASIC_TABLE + "//tr[14]/td[2]/allText()"));

        String contactTable = DETAIL_TABLE + "//table[@id='tb1']";
        device.setFeeStandard(textOf(html, contactTable + "//tr[3]/td/allText()"));
        device.setInstitute(withoutPrefix(html, contactTable + "//tr[4]/td/text()", 1));
        device.setContact(withoutPrefix(html, contactTable + "//tr[5]/td/text()", 1));
        device.setEmail(withoutPrefix(html, contactTable + "//tr[6]/td/text()", 1));
        device.setContactNumber(withoutPrefix(html, contactTable + "//tr[7]/td/allText()", 6));

        device.setSpecification(withoutPrefix(html, DETAIL_TABLE + "//table[@id='tb2']//td/allText()", 5));

        String originTable = DETAIL_TABLE + "//table[@id='tb4']";
        device.setCountry(withoutPrefix(html, originTable + "//tr[1]/td/text()", 1));
        device.setManufacturer(withoutPrefix(html, originTable + "//tr[2]/td/text()", 1));

        device.setProvince("山西省");
        device.setDataSource(DOMAIN);
        device.setRecorder("liuzhiguo");
        device.setRecordDateTime(new Date());
        return device;
    }

    /** Returns the text selected by {@code xpath} exactly as the selector's {@code toString()} reports it. */
    private static String textOf(Html html, String xpath) {
        return html.xpath(xpath).toString();
    }

    /** Returns the selected text with surrounding whitespace removed, or {@code null} when there is none. */
    private static String trimmed(Html html, String xpath) {
        String raw = textOf(html, xpath);
        return raw == null ? null : raw.trim();
    }

    /**
     * Returns the selected text without its first {@code prefixLength} characters;
     * {@code null} when nothing was selected, empty when the text is not longer than the prefix.
     */
    private static String withoutPrefix(Html html, String xpath, int prefixLength) {
        String raw = textOf(html, xpath);
        if (raw == null) {
            return null;
        }
        return raw.length() > prefixLength ? raw.substring(prefixLength) : "";
    }

    /** Uses everything after the last {@code '='} of the page URL as the device code. */
    private String createCode(Page page) {
        String urlString = page.getUrl().toString();
        return urlString.substring(urlString.lastIndexOf("=") + 1);
    }

    @Override
    public Site getSite() {
        return site;
    }

    /** Starts the crawl at the first list page using ten threads. */
    public static void main(String[] args) {
        // Runs before any worker thread exists, so the plain increment is safe here.
        Spider.create(new ShanxiProcessor()).addUrl(PREFIX_LIST + currentPage++).thread(10).run();
    }
}
ImageDownloader.java
package com.demo.webmagic.util;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.URL;
import java.net.URLConnection;
import java.util.UUID;
/**
 * Downloads an image addressed by a URL into a local folder and reports the
 * path of the local file.
 */
public class ImageDownloader {

    /** Time allowed for establishing the connection, in milliseconds. */
    private static final int CONNECT_TIMEOUT_MS = 5 * 1000;
    /** Time allowed for a single blocking read, in milliseconds, so a stalled server cannot hang the caller. */
    private static final int READ_TIMEOUT_MS = 10 * 1000;

    /** Strategy for choosing the local file name. */
    public enum ImgNameType {
        /** Derive the name from the URL. */
        OBTAIN,
        /** Use a random UUID followed by ".jpg". */
        UUID
    }

    /**
     * Downloads {@code imageUrl} into {@code savePath}, unless a file with the
     * chosen name is already there.
     *
     * @param imageUrl    address of the image; {@code null} yields {@code null}
     * @param savePath    target folder, created when missing
     * @param imgNameType how the local file name is chosen
     * @return path of the local file, or {@code null} when {@code imageUrl} is {@code null}
     * @throws Exception when the folder cannot be created, no file name can be derived,
     *                   or the transfer fails
     */
    public static String download(String imageUrl, String savePath, ImgNameType imgNameType) throws Exception {
        if (imageUrl == null) {
            return null;
        }
        String imageName = obtainImageName(imageUrl, imgNameType);
        String imgSavePath = createImgSavePath(savePath, imageName);
        if (new File(imgSavePath).exists()) {
            System.out.println("图片已存在:" + imgSavePath);
            return imgSavePath;
        }
        downloadImage(imageUrl, imgSavePath);
        return imgSavePath;
    }

    /**
     * Copies the bytes behind {@code imageUrl} into {@code imgSavePath}.
     * Both streams are closed on every path, and a file left behind by a failed
     * transfer is removed so that it is not mistaken for a finished download later.
     *
     * <p>Known limitation noted by the original author: URLs whose path contains
     * Chinese characters fail.
     */
    private static void downloadImage(String imageUrl, String imgSavePath) throws Exception {
        URLConnection con = new URL(imageUrl).openConnection();
        con.setConnectTimeout(CONNECT_TIMEOUT_MS);
        con.setReadTimeout(READ_TIMEOUT_MS);

        File target = new File(imgSavePath);
        boolean complete = false;
        try {
            try (InputStream is = con.getInputStream();
                 OutputStream os = new FileOutputStream(target)) {
                byte[] bs = new byte[2048];
                int len;
                while ((len = is.read(bs)) != -1) {
                    os.write(bs, 0, len);
                }
            }
            // Reached only after both streams were closed without error.
            complete = true;
        } finally {
            if (!complete) {
                // Streams are already closed here, so the delete also works on Windows.
                target.delete();
            }
        }
        System.out.println("下载完成");
    }

    /** Joins folder and file name using the platform's separator, creating the folder when needed. */
    private static String createImgSavePath(String savePath, String imageName) throws IOException {
        return new File(createFolder(savePath), imageName).getPath();
    }

    /**
     * Chooses the local file name: a random UUID plus ".jpg" for {@link ImgNameType#UUID};
     * otherwise the text after the last {@code '='} plus ".jpg" when the URL has a query,
     * or the text after the last {@code '/'}.
     *
     * @throws IllegalArgumentException when the URL yields no usable file name
     */
    private static String obtainImageName(String urlString, ImgNameType imgNameType) {
        if (imgNameType == ImgNameType.UUID) {
            return UUID.randomUUID().toString() + ".jpg";
        }
        String name;
        if (urlString.contains("?")) {
            name = urlString.substring(urlString.lastIndexOf("=") + 1) + ".jpg";
        } else {
            name = urlString.substring(urlString.lastIndexOf("/") + 1);
        }
        // The URL comes from scraped content: keep only the last path segment so a
        // value such as "../../x" cannot place the file outside the target folder.
        int lastSeparator = Math.max(name.lastIndexOf('/'), name.lastIndexOf('\\'));
        name = name.substring(lastSeparator + 1);
        if (name.isEmpty() || ".".equals(name) || "..".equals(name)) {
            throw new IllegalArgumentException("Cannot derive a file name from URL: " + urlString);
        }
        return name;
    }

    /**
     * Returns {@code savePath} as a folder, creating it (and missing parents) first.
     *
     * @throws IOException when the folder does not exist and cannot be created
     */
    private static File createFolder(String savePath) throws IOException {
        File sf = new File(savePath);
        // mkdirs() also returns false when another thread created the folder first,
        // hence the second isDirectory() check.
        if (!sf.isDirectory() && !sf.mkdirs() && !sf.isDirectory()) {
            throw new IOException("Cannot create folder: " + savePath);
        }
        return sf;
    }
}
pom.xml
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<!-- Maven build descriptor for the crawler demo; the artifact is packaged as a jar. -->
<modelVersion>4.0.0</modelVersion>
<groupId>com.demo</groupId>
<artifactId>maven-webmagic</artifactId>
<version>0.0.1-SNAPSHOT</version>
<packaging>jar</packaging>
<name>maven-webmagic</name>
<url>http://maven.apache.org</url>
<properties>
<!-- Source files are read as UTF-8. -->
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
</properties>
<dependencies>
<!-- WebMagic crawler framework: core module and extension module, both pinned to 0.5.3. -->
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-core</artifactId>
<version>0.5.3</version>
</dependency>
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-extension</artifactId>
<version>0.5.3</version>
</dependency>
</dependencies>
<build>
<plugins>
<!-- Compile with Java 1.7 source and target levels, reading sources as UTF-8. -->
<plugin>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.5.1</version>
<configuration>
<source>1.7</source>
<target>1.7</target>
<encoding>UTF-8</encoding>
</configuration>
</plugin>
<!-- Run the source plugin's "jar" goal in the "attach-sources" execution. -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-source-plugin</artifactId>
<version>3.0.1</version>
<executions>
<execution>
<id>attach-sources</id>
<goals>
<goal>jar</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
</build>
</project>
补充:返回数据为 JSON 格式时的处理
System.out.println(page.getJson().jsonPath("$.name"));
如果控制台输出的内容中含有转义字符,就不能直接使用 JSONPath 解析,需要先将其转换为 JSONObject 对象再进行处理。
版权声明:本文为zhiguoliu11原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。