Pattern p = Pattern.compile(“href\\s*=\\s*(?:”([^”]*)”|'([^’]*)’|([^”‘>\s]+))”);//这个不正确
/**
* 得到网页中图片的地址
*/
public static List<String> getImgStr(String htmlStr){
String img=””;
Pattern p_image;
Matcher m_image;
List<String> pics = new ArrayList<String>();
// String regEx_img = “<img.*src=(.*?)[^>]*?>”; //图片链接地址
String regEx_img = “<img.*src\\s*=\\s*(.*?)[^>]*?>”;
p_image = Pattern.compile
(regEx_img,Pattern.CASE_INSENSITIVE);
m_image = p_image.matcher(htmlStr);
while(m_image.find()){
img = img + “,” + m_image.group();
// Matcher m = Pattern.compile(“src=\”?(.*?)(\”|>|\\s+)”).matcher(img); //匹配src
Matcher m = Pattern.compile(“src\\s*=\\s*\”?(.*?)(\”|>|\\s+)”).matcher(img);
while(m.find()){
pics.add(m.group(1));
}
}
return pics;
}
//重点在于正则表达式 <img.*src=(.*?)[^>]*?>
// src=\"?(.*?)(\"|>|\\s+)
private final static String regxpForHtml = “<([^>]*)>”; // 过滤所有以<开头以>结尾的标签
private final static String regxpForImgTag = “<\\s*img\\s+([^>]*)\\s*>”; // 找出IMG标签
private final static String regxpForImaTagSrcAttrib = “src=\”([^\”]+)\””; // 找出IMG标签的SRC属性
String regxp = “<\\s*” +
tag +
“\\s+([^>]*)\\s*>”;
代码中的 tag 是动态变化的(即指定的标签名)。
1. public static String getImgStr(String htmlStr){
2. String img=””,tmp=””;
3. java.util.regex.Pattern p_image;
4. java.util.regex.Matcher m_image;
5.
6. String regEx_img = “http://[([a-z0-9]|.|/|\\-)]+.[(jpg)|(bmp)|(gif)|(png)]”;//图片链接地址
7. p_image = java.util.regex.Pattern.compile(regEx_img,java.util.regex.Pattern.CASE_INSENSITIVE);
8. m_image = p_image.matcher(htmlStr);
9. while(m_image.find()){
10. img = img + “,” + m_image.group();
11. }
12. if(img.indexOf(“,”)>=0)
13. return img.substring(1);
14. else
15. return img;
16. }
方法一:
http://www.cnblogs.com/jintan/archive/2009/10/31/1593639.html
package com.cn;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Demo: collects the {@code src} value of every {@code <img>} tag in an HTML string
 * using a regular expression.
 *
 * <p>This is not an HTML parser. A tag in which another attribute's quoted value contains
 * whitespace followed by {@code src=} can still be misread, and character references
 * inside the value are not decoded.
 */
public class img_src {

    /**
     * Finds an img tag that has a src attribute. Group 1 is the raw text that follows
     * {@code src=} up to, but not including, the tag's closing {@code >}; it still carries
     * any quotes and any attributes that come after src.
     *
     * <p>The attribute name must directly follow the tag name or be preceded by whitespace,
     * and the earliest such occurrence in the tag wins.
     */
    public static final Pattern PATTERN = Pattern.compile(
            "<img\\s+(?:[^>]*?\\s)??src\\s*=\\s*([^>]+)",
            Pattern.CASE_INSENSITIVE | Pattern.MULTILINE);

    /** Prints the image addresses found in a small sample document. */
    public static void main(String[] args) {
        String html = "<html>\r\n" +
                "<head><title>test</title><head>\r\n" +
                "<body>" +
                "<P><IMG height=\"100\" src='abc.png' weight=\"30\">abcdefg" +
                "<img src='http://abc.xyz.com/123/456.jpg' /><br>" +
                "<IMG height=\"100\" \r\n" +
                " src=\"abc.jpg\" \r\n" +
                " weight=\"30\">abcdefg \r\n" +
                " <img src=ttt.jpg>" +
                " <img src=123.jpg />" +
                "</body></html>";
        System.out.println(getImgSrc(html));
    }

    /**
     * Returns the src value of every img tag in the given HTML, in document order.
     *
     * @param html the HTML text to scan; {@code null} is treated as empty
     * @return a new mutable list of src values; an empty or blank src is reported
     *         as an empty string
     */
    public static List<String> getImgSrc(String html) {
        List<String> sources = new ArrayList<String>();
        if (html == null) {
            return sources;
        }
        Matcher matcher = PATTERN.matcher(html);
        while (matcher.find()) {
            sources.add(extractValue(matcher.group(1)));
        }
        return sources;
    }

    /**
     * Cuts the attribute value out of the raw text captured after {@code src=}.
     * A quoted value ends at the matching quote; an unquoted value, or a value whose
     * closing quote is missing, ends at the first whitespace.
     */
    private static String extractValue(String raw) {
        String rest = raw;
        char first = rest.charAt(0); // never empty: the capturing group requires one character
        if (first == '\'' || first == '"') {
            int close = rest.indexOf(first, 1);
            if (close >= 0) {
                return rest.substring(1, close);
            }
            rest = rest.substring(1); // unterminated quote: fall back to the unquoted rule
        }
        // A limit of 2 keeps the (possibly empty) first token, so index 0 always exists.
        return rest.trim().split("\\s+", 2)[0];
    }
}
方法二:
package com.cn;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Demo: prints the double-quoted {@code src} value of every {@code <IMG>} tag found in a
 * sample string, using a two-step regex scan (find the tag, then find src inside it).
 */
public class test {

    /** A whole img tag, from the opening angle bracket to the first closing one. */
    private static final Pattern IMG_TAG =
            Pattern.compile("<IMG\\b[^>]*>", Pattern.CASE_INSENSITIVE);

    /** A double-quoted src attribute preceded by whitespace; group 1 is the value. */
    private static final Pattern SRC_ATTR =
            Pattern.compile("\\ssrc\\s*=\\s*\"([^\"]*)\"", Pattern.CASE_INSENSITIVE);

    /** Prints one line per src value found in the sample tag. */
    public static void main(String[] args) {
        String s = "<IMG height=55 src=\"http://www.gobygo.com/TheGoByGo/images/book-channel.gif\" width=210 border=0 />";
        Matcher tag = IMG_TAG.matcher(s);
        while (tag.find()) {
            // Search only inside the tag that was just matched, not the whole input.
            Matcher src = SRC_ATTR.matcher(tag.group());
            while (src.find()) {
                System.out.println(src.group(1));
            }
        }
    }
}