JAVA实现格式化XML的压缩,去除了间距、缩进、换行,使其缩小、压缩。

  • Post author:
  • Post category:java




刚开始使用在线XML格式化网站的时候就想写一个一样功能的java程序出来。

如:

在线XML格式化


或:

在线XML格式化2



刚开始的设想是:使用XML解析库(如DOM)解析XML并序列化得到的结果,主要使用Transformer

代码如下:

package com.example;

import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import java.io.StringReader;
import java.io.StringWriter;
import java.nio.charset.StandardCharsets;

import org.xml.sax.InputSource;


/**
     * 将格式化的XML压缩成无空格、换行的XML。
 */


public class XMLCompress {
    public static void main(String[] args) throws Exception {
        String formattedXml = "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n" +
                "<Root>\n" +
                "    <Head>\n" +
                "        <enterid>aaaaaa</enterid>\n" +
                "        <txdate>20230710</txdate>\n" +
                "        <seqno>20230711105728361668</seqno>\n" +
                "        <inflag>BM</inflag>\n" +
                "        <termno>1</termno>\n" +
                "        <signtxt></signtxt>\n" +
                "    </Head>\n" +
                "    <Body>\n" +
                "        <acctname1>测试华</acctname1>\n" +
                "        <acctno1>6200000000000003642</acctno1>\n" +
                "        <curry>01</curry>\n" +
                "        <txamt>1.00</txamt>\n" +
                "        <busitype>2</busitype>\n" +
                "        <remark>备注:(主体和转入帐号、户名不能为空)</remark>\n" +
                "        <accttype1>bm</accttype1>\n" +
                "        <accttype2>hzz</accttype2>\n" +
                "        <acctname2>边民互助组一</acctname2>\n" +
                "        <acctno2>172612010105613788</acctno2>\n" +
                "    </Body>\n" +
                "</Root>";
//        System.out.println("原:"+formattedXml);
//        System.out.println("--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------");
            String compactXml = compactXml(formattedXml);
        System.out.println("解析:\n" + compactXml);
        System.out.println("位数:" + compactXml.getBytes(StandardCharsets.UTF_8).length);
        System.out.println("---------------------------------------------------------------------------------------------------------------------------------------------");
        String test="<?xml version=\"1.0\" encoding=\"utf-8\"?><Root><Head><enterid>aaaaaa</enterid><txdate>20230710</txdate><seqno>20230711105728361668</seqno><inflag>BM</inflag><termno>1</termno><signtxt></signtxt></Head><Body><acctname1>测试华</acctname1><acctno1>6200000000000003642</acctno1><curry>01</curry><txamt>1.00</txamt><busitype>2</busitype><remark>备注:(主体和转入帐号、户名不能为空)</remark><accttype1>bm</accttype1><accttype2>hzz</accttype2><acctname2>边民互助组一</acctname2><acctno2>172612010105613788</acctno2></Body></Root>";
        System.out.println("真压缩:\n"+test);
        System.out.println(test.getBytes(StandardCharsets.UTF_8).length);
        System.out.println("比较:"+(test.getBytes(StandardCharsets.UTF_8).length == compactXml.getBytes(StandardCharsets.UTF_8).length));
//        System.out.println("--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------");
//        String test="<?xml version=\"1.0\" encoding=\"utf-8\"?><Root><Head><enterid>aaaaaa</enterid><txdate>20230626</txdate><seqno>20230626000000000008</seqno><inflag>BM</inflag><termno>1</termno><signtxt>XDdwTo4r/n9bGNyYVOT9B/2n6vQQWyUdAbETGJWVjdjap75ayM4bjIbzBQC7/8KVR2vdQmpJdYkSliGg2fx9pP9Lagp9EzN9xXUvhym5/TllTSTPJAoBF7e0yQXaDkTgzo5m/pZOqzgunaCaf/Eje9ipyjvVjjMF3RPndHhZ7BM=</signtxt></Head><Body><acctname1>测试华</acctname1><acctno1>6200000000000003642</acctno1><curry>01</curry><txamt>1.00</txamt><busitype>2</busitype><remark>备注:</remark><accttype1>bm</accttype1><accttype2>hzz</accttype2><acctname2>边民互助组一</acctname2><acctno2>172612010105613788</acctno2></Body></Root>";
//        System.out.println("真:"+test);
//        System.out.println("比较:"+compactXml.equals(test));
    }

    public static String compactXml(String xmlStr) throws Exception {
        DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
        DocumentBuilder builder = factory.newDocumentBuilder();
        Document doc = builder.parse(new InputSource(new StringReader(xmlStr)));
        removeWhitespaceNodes(doc.getDocumentElement());

        TransformerFactory tfactory = TransformerFactory.newInstance();
        Transformer transformer = tfactory.newTransformer();
        transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes");
        transformer.setOutputProperty(OutputKeys.INDENT, "no");
        StringWriter writer = new StringWriter();
        transformer.transform(new DOMSource(doc), new StreamResult(writer));

        // Manually add the XML declaration, so it doesn't include the standalone attribute
        return "<?xml version=\"1.0\" encoding=\"utf-8\"?>" + writer.toString();
    }

    private static void removeWhitespaceNodes(Node node) {
        NodeList children = node.getChildNodes();
        for (int i = children.getLength() - 1; i >= 0; i--) {
            Node child = children.item(i);
            if (child.getNodeType() == Node.TEXT_NODE && !child.getNodeValue().equals(" ") && child.getNodeValue().trim().isEmpty()) {
                node.removeChild(child);
            } else if (child.getNodeType() == Node.ELEMENT_NODE) {
                removeWhitespaceNodes(child);
            }
        }
    }
}

​ 但实际运行后和在线XML格式化网站的结果对比发现有问题;程序存在错误,因为使用XML解析库(如DOM)解析XML并序列化得到的结果.所以在压缩时会压缩成。导致原XML内容长度位数发生改变。

将压缩成了。这是因为我们使用了Transformer来序列化XML,而Transformer默认会将空元素转换为自闭合标签。

查阅后发现根据XML语法,和是等价的,它们都表示一个空的signtxt元素。所以,当使用XML解析库解析并序列化XML时,会自动转换为。这是正常的,并符合XML规范。

上述问题解决了很久解决不了,因为是XML规范导致的错误。所以就采用了下面的其他方法



方法二:使用replaceAll直接替换空格和换行。

后面冷静下来才发现,str.replaceAll(“\n”, “”);replaceAll(” “, “”);直接替换就好了,再加上一个从XML声明的”>”后开始替换空格,一共就三行代码,,,,,,

原来那个XML解析库(如DOM)解析XML,根本不用这么复杂。

详细代码如下:

package com.example;



import java.nio.charset.StandardCharsets;


/**
 * 将格式化的XML压缩成无空格、换行的XML。
 */

public class XMLCompress {
    public static String xmlCompress(String str){


        String compressedString = str.replaceAll("\n", "");
        int startIndex = compressedString.indexOf(">") + 1;
        compressedString = compressedString.substring(0, startIndex) + compressedString.substring(startIndex).replaceAll(" ", "");


        System.out.println("解析:\n" + compressedString);
        System.out.println("位数:" + compressedString.getBytes(StandardCharsets.UTF_8).length);

        System.out.println("---------------------------------------------------------------------------------------------------------------------------------------------");
         String test="<?xml version=\"1.0\" encoding=\"utf-8\"?><Root><Head><enterid>aaaaaa</enterid><txdate>20230710</txdate><seqno>20230711105728361668</seqno><inflag>BM</inflag><termno>1</termno><signtxt></signtxt></Head><Body><acctname1>测试华</acctname1><acctno1>6200000000000003642</acctno1><curry>01</curry><txamt>1.00</txamt><busitype>2</busitype><remark>备注:(主体和转入帐号、户名不能为空)</remark><accttype1>bm</accttype1><accttype2>hzz</accttype2><acctname2>边民互助组一</acctname2><acctno2>172612010105613788</acctno2></Body></Root>";
        System.out.println("真压缩:\n"+test);
        System.out.println(test.getBytes(StandardCharsets.UTF_8).length);
        System.out.println("比较:"+(test.getBytes(StandardCharsets.UTF_8).length == compressedString.getBytes(StandardCharsets.UTF_8).length));
        return compressedString;
    }

    public static void main(String[] args) throws Exception {
        String xmlString = "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n" +
                "<Root>\n" +
                "    <Head>\n" +
                "        <enterid>NPJPZH</enterid>\n" +
                "        <txdate>20230710</txdate>\n" +
                "        <seqno>20230711105728361668</seqno>\n" +
                "        <inflag>BM</inflag>\n" +
                "        <termno>1</termno>\n" +
                "        <signtxt></signtxt>\n" +
                "    </Head>\n" +
                "    <Body>\n" +
                "        <acctname1>测试华</acctname1>\n" +
                "        <acctno1>6229920200000003642</acctno1>\n" +
                "        <curry>01</curry>\n" +
                "        <txamt>1.00</txamt>\n" +
                "        <busitype>2</busitype>\n" +
                "        <remark>备注:(主体和转入帐号、户名不能为空)</remark>\n" +
                "        <accttype1>bm</accttype1>\n" +
                "        <accttype2>hzz</accttype2>\n" +
                "        <acctname2>边民互助组一</acctname2>\n" +
                "        <acctno2>172612010105613788</acctno2>\n" +
                "    </Body>\n" +
                "</Root>";

        xmlCompress(xmlString);

    }
}



版权声明:本文为MAPLE__f原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。