这里我使用 SAX Parser 实现解析:它从维基百科转储文件中提取 title 元素的文本,
以及 redirect 元素的 title 属性。
package parser;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
public class SAXHandler extends DefaultHandler {
List list;
int count=0,counter=0;
int MAX_SIZE=100000;
String temp=””;
int counterz=0;
public static void main(String[] args) throws ParserConfigurationException, SAXException, IOException{
long start = System.currentTimeMillis();
SAXHandler saxhandler=new SAXHandler();
saxhandler.assign();
saxhandler.parseDoc();
long end = System.currentTimeMillis();
System.out.println(“Time taken to write is ” + (end – start) + “msecs”);
}
void assign(){
list = new ArrayList();
}
void parseDoc() throws ParserConfigurationException, SAXException, IOException{
SAXParserFactory spf = SAXParserFactory.newInstance();
SAXParser sp = spf.newSAXParser();
sp.parse(“D:\\XMLParsing_Files\\enwiki-20120902-pages-articles-multistream.xml”, this);
writeToFile(list); // for writing the end elements
}
public void startDocument() throws SAXException {
}
public void endDocument() throws SAXException {
}
public void startElement(String uri, String localName,String qName, Attributes attributes)throws SAXException {
if(qName.equalsIgnoreCase(“redirect”))
{
list.add(attributes.getValue(“title”));
count++;
if(count==MAX_SIZE)
{
try {
writeToFile(list);
} catch (IOException e) {
e.printStackTrace();
}
list.clear();
count=0;
}
}
}
public void endElement(String uri, String localName, String qName)throws SAXException {
if(qName.equalsIgnoreCase(“title”))
{
list.add(temp);
count++;
if(count==MAX_SIZE)
{
try {
writeToFile(list);
} catch (IOException e) {
e.printStackTrace();
}
list.clear();
count=0;
}
}
}
public void characters(char ch[], int start, int length)throws SAXException {
temp=””;
temp=new String(ch,start,length);
}
void writeToFile(List list) throws IOException{
Collections.sort(list);
File file = new File(“D:\\XMLParsing_Files\\Extracted_Data\\Extracted_Sorted_Data_” + getSuffix() + “.txt”);
if (!file.exists()) {
file.createNewFile();
}
FileWriter fw = new FileWriter(file.getAbsoluteFile());
PrintWriter pw = new PrintWriter(fw);
Iterator it = list.iterator();
while (it.hasNext()) {
pw.println(it.next());
}
pw.println(“zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz”);
pw.close();
System.out.println(++counterz + “Done”);
}
int getSuffix(){
counter++;
return counter;
}
}