JAVA正则表达式,提取img的src问题!!!

  • Post author:
  • Post category:java





Pattern  p = Pattern.compile(“href\\s*=\\s*(?:”([^”]*)”|'([^’]*)’|([^”‘>\s]+))”);//这个不正确









/**

* 得到网页中图片的地址

*/

public static List<String> getImgStr(String htmlStr){

String img=””;

Pattern p_image;

Matcher m_image;

List<String> pics = new ArrayList<String>();

//     String regEx_img = “<img.*src=(.*?)[^>]*?>”; //图片链接地址




String regEx_img = “<img.*src\\s*=\\s*(.*?)[^>]*?>”;

p_image = Pattern.compile

(regEx_img,Pattern.CASE_INSENSITIVE);

m_image = p_image.matcher(htmlStr);

while(m_image.find()){

img = img + “,” + m_image.group();

// Matcher m  = Pattern.compile(“src=\”?(.*?)(\”|>|\\s+)”).matcher(img); //匹配src



Matcher m  = Pattern.compile(“src\\s*=\\s*\”?(.*?)(\”|>|\\s+)”).matcher(img);




while(m.find()){


pics.add(m.group(1));

}

}

return pics;

}

//重点在于正则表达式 <img.*src=(.*?)[^>]*?>

//               src=\"?(.*?)(\"|>|\\s+)




private final static String regxpForHtml = “<([^>]*)>”; // 过滤所有以<开头以>结尾的标签


private final static String regxpForImgTag = “<\\s*img\\s+([^>]*)\\s*>”; // 找出IMG标签


private final static String regxpForImaTagSrcAttrib = “src=\”([^\”]+)\””; // 找出IMG标签的SRC属性

String regxp = “<\\s*” +

tag +

“\\s+([^>]*)\\s*>”;

上面表达式中的 tag 是一个动态变量,即要匹配的标签名(原文中以红色标出)。


1. public static String getImgStr(String htmlStr){

2.         String img=””,tmp=””;

3.         java.util.regex.Pattern p_image;

4.         java.util.regex.Matcher m_image;

5.

6.         String regEx_img = “http://[([a-z0-9]|.|/|\\-)]+.[(jpg)|(bmp)|(gif)|(png)]”;//图片链接地址

7.         p_image = java.util.regex.Pattern.compile(regEx_img,java.util.regex.Pattern.CASE_INSENSITIVE);

8.         m_image = p_image.matcher(htmlStr);

9.         while(m_image.find()){

10.             img = img + “,” + m_image.group();

11.         }

12.         if(img.indexOf(“,”)>=0)

13.             return img.substring(1);

14.         else

15.             return img;

16.    }



方法一:



http://www.cnblogs.com/jintan/archive/2009/10/31/1593639.html



package com.cn;


import java.util.ArrayList;

import java.util.List;

import java.util.regex.Matcher;

import java.util.regex.Pattern;



/**
 * Demonstrates extracting the {@code src} attribute of {@code <img>} tags with a single
 * regular expression.
 *
 * <p>Known limitation: this is not an HTML parser. An img tag whose other attribute value
 * contains text such as {@code src='abc'} (for example {@code <img alt="src='abc'">}) is
 * reported as if that text were a real src attribute.
 */
public class img_src {

    /**
     * Matches an img tag up to, but not including, its closing '>'.
     * Group 1 holds everything from the first character of the src value to the end of
     * the tag, so it may still contain the surrounding quotes and any later attributes.
     */
    public static final Pattern PATTERN = Pattern.compile(
            "<img\\s+(?:[^>]*)src\\s*=\\s*([^>]+)",
            Pattern.CASE_INSENSITIVE | Pattern.MULTILINE);

    /** Runs the extraction on a small sample document and prints the resulting list. */
    public static void main(String[] args) {
        String html = "<html>\r\n"
                + "<head><title>test</title><head>\r\n"
                + "<body>"
                + "<P><IMG   height=\"100\"       src='abc.png'   weight=\"30\">abcdefg"
                + "<img   src='http://abc.xyz.com/123/456.jpg'   /><br>"
                + "<IMG   height=\"100\"       \r\n"
                + "       src=\"abc.jpg\"   \r\n"
                + "   weight=\"30\">abcdefg         \r\n"
                + "   <img   src=ttt.jpg>"
                + "   <img   src=123.jpg   />"
                + "</body></html>";
        System.out.println(getImgSrc(html));
    }

    /**
     * Returns the src value of every img tag matched by {@link #PATTERN}.
     *
     * @param html the HTML text to scan; {@code null} yields an empty list
     * @return the src values in document order with surrounding quotes removed
     */
    public static List<String> getImgSrc(String html) {
        List<String> list = new ArrayList<String>();
        if (html == null) {
            return list;
        }
        Matcher matcher = PATTERN.matcher(html);
        while (matcher.find()) {
            String value = extractValue(matcher.group(1));
            if (value != null) {
                list.add(value);
            }
        }
        return list;
    }

    /**
     * Cuts the attribute value out of the raw text captured by group 1 of {@link #PATTERN}.
     *
     * @param raw the captured text, starting at the first character of the value
     * @return the value without quotes, or {@code null} when the text holds no value
     */
    private static String extractValue(String raw) {
        if (raw == null || raw.isEmpty()) {
            return null;
        }
        char first = raw.charAt(0);
        if (first == '\'' || first == '"') {
            int close = raw.indexOf(first, 1);
            // A missing closing quote used to cause substring(1, -1) to throw;
            // fall back to everything after the opening quote.
            return close < 0 ? raw.substring(1) : raw.substring(1, close);
        }
        // Unquoted value: it ends at the first whitespace character.
        String[] parts = raw.split("\\s");
        // split() returns an empty array for whitespace-only input such as "<img src= >".
        return parts.length == 0 ? null : parts[0];
    }
}



方法二:


package com.cn;


import java.util.regex.Matcher;

import java.util.regex.Pattern;


/**
 * Demonstrates a two-step extraction on entity-escaped markup: first locate each escaped
 * IMG tag, then pull the double-quoted src value out of that tag.
 */
public class test {

    /**
     * Matches one entity-escaped, self-closing IMG tag. The quantifier is reluctant so
     * the match ends at the first "/&gt;" instead of running on to the last one.
     */
    private static final Pattern IMG_TAG = Pattern.compile("&lt;IMG[\\w\\s\\d\\p{Punct}]*?/&gt;");

    /** Matches a double-quoted src attribute; group 1 is the value between the quotes. */
    private static final Pattern SRC_ATTR = Pattern.compile("src=\"([^\"]*)\"");

    /** Prints the src value of every escaped IMG tag in the sample string, one per line. */
    public static void main(String[] args) {
        String s = "&lt;IMG height=55 src=\"http://www.gobygo.com/TheGoByGo/images/book-channel.gif\" width=210 border=0 /&gt;";

        Matcher tagMatcher = IMG_TAG.matcher(s);
        while (tagMatcher.find()) {
            // Search inside the matched tag only; scanning the whole input here would
            // print every src once per tag.
            Matcher srcMatcher = SRC_ATTR.matcher(tagMatcher.group());
            while (srcMatcher.find()) {
                System.out.println(srcMatcher.group(1));
            }
        }
    }
}