当前位置:网站首页>Jsoup supports XPath

Jsoup supports XPath

2022-06-24 19:34:00 God__ is__ a__ girl

XPath is a specialized query language for structured XML documents, with a powerful syntax. This article does not include an XPath syntax tutorial.

jsoup is a Java HTML parser that can directly parse a URL or a string of HTML text. It provides a very convenient API for extracting and manipulating data through DOM traversal, CSS selectors, and jQuery-like methods. Even so, selecting an element this way is still not as simple and direct as using XPath, and XPath is supported by a wide choice of libraries.

Unfortunately, jsoup does not support XPath, so the author wrote a utility class that adds XPath support to jsoup. I hope it helps anyone who needs it!

Tool class

package com.ry.mytools.util;

import com.sun.org.apache.xerces.internal.dom.ElementImpl;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Attribute;
import org.jsoup.select.Elements;
import org.jsoup.select.NodeTraversor;
import org.jsoup.select.NodeVisitor;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.*;

import javax.xml.namespace.QName;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathFactory;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReentrantLock;

/**
 * XPath parsing utility class for jsoup.
 *
 * @author liuhh
 */
@SuppressWarnings("restriction")
public class JsoupParserUtil {
    
    // Shared factory for the DocumentBuilder instances that fromJsoup uses to create empty W3C documents.
    protected final static DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
    // Class logger.
    private final static Logger log = LoggerFactory.getLogger(JsoupParserUtil.class);
    // Single shared XPath evaluator; parse(Node, ...) only calls it while holding LOCK.
    private final static XPath xPath = XPathFactory.newInstance().newXPath();
    // Shared factory for the Transformer instances that getW3cDocString uses to serialize DOM nodes.
    protected static TransformerFactory tf = TransformerFactory.newInstance();
    // Held around XPath evaluation (parse) and DOM serialization (getW3cDocString).
    private static final Lock LOCK = new ReentrantLock();

    /**
     * Counts the nodes selected by an XPath expression.
     *
     * @param ele   jsoup element (or document) the expression is evaluated against
     * @param xpath XPath expression, evaluated as a node-set
     * @return the size of the resulting node-set, or {@code 0} when the evaluation does not
     *         produce a node list or an exception is raised
     */
    public static int getEleChildNum(final org.jsoup.nodes.Element ele, final String xpath) {
        int count = 0;
        try {
            final Object evaluated = parse(ele, xpath, XPathConstants.NODESET);
            if (evaluated instanceof NodeList) {
                count = ((NodeList) evaluated).getLength();
            }
        } catch (Exception e) {
            log.error(" according to xpath:{}, Error getting the number of child nodes , The reason for the error :" + e.getMessage(), xpath);
        }
        return count;
    }

    /**
     * Evaluates an XPath expression as a boolean, which tells whether it selects anything.
     *
     * @param ele   jsoup element (or document) the expression is evaluated against
     * @param xpath XPath expression to test
     * @return the boolean result of the evaluation; {@code false} when the evaluation does not
     *         produce a {@code Boolean} or an exception is raised
     */
    public static boolean exists(final org.jsoup.nodes.Element ele, final String xpath) {
        try {
            final Object outcome = parse(ele, xpath, XPathConstants.BOOLEAN);
            return outcome instanceof Boolean && ((Boolean) outcome).booleanValue();
        } catch (Exception e) {
            log.error(" Check xpath:{}, There was an error while checking whether it exists ,!" + e.getMessage(), xpath);
            return false;
        }
    }

    /**
     * Returns the single W3C element selected by an XPath expression.
     *
     * <p>The result type is {@code com.sun.org.apache.xerces.internal.dom.ElementImpl}, so a node
     * produced by any other DOM implementation is reported as {@code null}.
     *
     * @param ele   jsoup element (or document) the expression is evaluated against
     * @param xpath XPath expression, evaluated as a single node
     * @return the matched element, or {@code null} when nothing matches, the match is not an
     *         {@code ElementImpl}, or an exception is raised
     */
    public static ElementImpl getW3cElementImpl(final org.jsoup.nodes.Element ele, final String xpath) {
        ElementImpl found = null;
        try {
            final Object evaluated = parse(ele, xpath, XPathConstants.NODE);
            if (evaluated instanceof ElementImpl) {
                found = (ElementImpl) evaluated;
            }
        } catch (Exception e) {
            log.error(" according to xpath:{}, obtain w3c Of Element Object has an error , reason :" + e.getMessage(), xpath);
        }
        return found;
    }

    /**
     * Returns the single element selected by an XPath expression, converted back to a jsoup
     * element through {@link #getJsoupEle(ElementImpl)}.
     *
     * @param ele   jsoup element (or document) the expression is evaluated against
     * @param xpath XPath expression, evaluated as a single node
     * @return the matched element as a jsoup element, or {@code null} when nothing matches, the
     *         match is not an {@code ElementImpl}, the conversion fails, or an exception is raised
     */
    public static org.jsoup.nodes.Element getJsoupElement(final org.jsoup.nodes.Element ele, final String xpath) {
        try {
            final Object evaluated = parse(ele, xpath, XPathConstants.NODE);
            if (!(evaluated instanceof ElementImpl)) {
                return null;
            }
            return getJsoupEle((ElementImpl) evaluated);
        } catch (Exception e) {
            log.error(" according to xpath:{}, obtain jsoup Of Element Object has an error , reason :" + e.getMessage(), xpath);
            return null;
        }
    }

    /**
     * Returns every element selected by an XPath expression, converted back to jsoup elements.
     *
     * <p>Matched nodes that are not {@code ElementImpl} instances are skipped, and so are nodes
     * whose conversion through {@link #getJsoupEle(ElementImpl)} fails, so the returned
     * collection never contains {@code null} entries.
     *
     * @param ele   jsoup element (or document) the expression is evaluated against
     * @param xpath XPath expression, evaluated as a node-set
     * @return the converted elements (possibly empty when no match could be converted), or
     *         {@code null} when the expression selects no nodes or an exception is raised
     */
    public static Elements getJsoupElements(final org.jsoup.nodes.Element ele, final String xpath) {
        try {
            final NodeList nodeList = getNodeList(ele, xpath);
            if (nodeList == null || nodeList.getLength() == 0) {
                return null;
            }
            final int len = nodeList.getLength();
            final Elements elements = new Elements();
            for (int i = 0; i < len; i++) {
                final Node node = nodeList.item(i);
                if (!(node instanceof ElementImpl)) {
                    continue;
                }
                final org.jsoup.nodes.Element converted = getJsoupEle((ElementImpl) node);
                // getJsoupEle reports a failed conversion as null; do not leak that into the result.
                if (converted != null) {
                    elements.add(converted);
                }
            }
            return elements;
        } catch (Exception e) {
            // Trailing throwable argument keeps the stack trace in the log.
            log.error(" according to xpath:{}, obtain jsoup Of Element Object has an error , reason :" + e.getMessage(), xpath, e);
        }
        return null;
    }

    /**
     * Evaluates an XPath expression against a jsoup element and returns the W3C node list.
     *
     * @param ele   jsoup element (or document) the expression is evaluated against
     * @param xpath XPath expression, evaluated as a node-set
     * @return the resulting node list, or {@code null} when the evaluation does not produce one
     *         or an exception is raised
     */
    public static NodeList getNodeList(final org.jsoup.nodes.Element ele, final String xpath) {
        try {
            final Object evaluated = parse(ele, xpath, XPathConstants.NODESET);
            return evaluated instanceof NodeList ? (NodeList) evaluated : null;
        } catch (Exception e) {
            log.error(e.getMessage(), e);
            return null;
        }
    }

    /**
     * Evaluates an XPath expression and returns its result as text.
     *
     * <p>When the expression selects exactly one node, the XPath string value of the expression
     * is returned. When it selects several nodes, the non-null node values are concatenated in
     * document order with every {@code "\r\n"} replaced by {@code "."}.
     *
     * <p>The jsoup tree is converted to a W3C DOM only once per call and that DOM is reused for
     * all evaluations.
     *
     * @param ele   jsoup element (or document) the expression is evaluated against
     * @param xpath XPath expression selecting text or attribute nodes
     * @return the text described above, or {@code null} when nothing is selected, the expression
     *         cannot be evaluated as a node-set, or an exception is raised
     */
    public static String getXpathString(final org.jsoup.nodes.Element ele, final String xpath) {
        try {
            final Node w3cNode = fromJsoup(ele);
            final Object matched = parse(w3cNode, xpath, XPathConstants.NODESET);
            if (!(matched instanceof NodeList)) {
                return null;
            }
            final NodeList nodes = (NodeList) matched;
            final int count = nodes.getLength();
            if (count == 0) {
                return null;
            }
            if (count == 1) {
                final Object single = parse(w3cNode, xpath, XPathConstants.STRING);
                return single == null ? null : single.toString();
            }
            final StringBuilder joined = new StringBuilder();
            for (int i = 0; i < count; i++) {
                final Node node = nodes.item(i);
                final String text = node == null ? null : node.getNodeValue();
                if (text != null) {
                    joined.append(text.replace("\r\n", "."));
                }
            }
            return joined.toString();
        } catch (Exception e) {
            // Trailing throwable argument keeps the stack trace in the log (replaces printStackTrace).
            log.error(" according to xpath:{} Error querying string :" + e.getMessage(), xpath, e);
        }
        return null;
    }

    /**
     * Evaluates an XPath expression and returns the node value of every selected node.
     *
     * @param ele   jsoup element (or document) the expression is evaluated against
     * @param xpath XPath expression, evaluated as a node-set
     * @return one entry per selected node ({@code null} for a null node or a node without a
     *         value), or {@code null} when nothing is selected, the evaluation does not produce
     *         a node list, or an exception is raised
     */
    public static List<String> getXpathListString(final org.jsoup.nodes.Element ele, final String xpath) {
        try {
            final Object evaluated = parse(ele, xpath, XPathConstants.NODESET);
            if (!(evaluated instanceof NodeList)) {
                return null;
            }
            final NodeList nodes = (NodeList) evaluated;
            final int total = nodes.getLength();
            if (total <= 0) {
                return null;
            }
            final List<String> values = new ArrayList<>();
            int idx = 0;
            while (idx < total) {
                final Node current = nodes.item(idx++);
                values.add(current == null ? null : current.getNodeValue());
            }
            return values;
        } catch (Exception e) {
            log.error(" according to xpath:{} Error querying string list :" + e.getMessage(), xpath);
            return null;
        }
    }

    /**
     * Converts a jsoup element to a W3C DOM and evaluates an XPath expression against it.
     *
     * @param doc      jsoup element (or document) to convert and query
     * @param xPathStr XPath expression
     * @param qName    requested result type, e.g. one of the {@code XPathConstants} values
     * @return whatever {@link #parse(Node, String, QName)} returns for the converted DOM
     */
    public static Object parse(final org.jsoup.nodes.Element doc, final String xPathStr, final QName qName) {
        return parse(fromJsoup(doc), xPathStr, qName);
    }

    /**
     * Evaluates an XPath expression against a W3C DOM node using the shared evaluator.
     *
     * <p>The evaluation runs while holding {@code LOCK}. A missing argument or a failed
     * evaluation is logged as a warning and reported as {@code null}; no exception escapes.
     *
     * @param doc      W3C node to query
     * @param xPathStr XPath expression
     * @param qName    requested result type, e.g. one of the {@code XPathConstants} values
     * @return the evaluation result, or {@code null} when an argument is null/blank or the
     *         evaluation fails
     */
    public static Object parse(final Node doc, final String xPathStr, final QName qName) {
        if (doc == null) {
            log.warn(" Parse document as null!");
            return null;
        }
        if (StringUtils.isBlank(xPathStr)) {
            log.warn(" Analytic Xpath The path is empty. !");
            return null;
        }
        if (null == qName) {
            log.warn(" The parsing type is null!");
            return null;
        }
        try {
            // Acquire the lock BEFORE the try/finally that releases it: if lock() itself failed
            // inside that try, the finally block would unlock a lock this thread does not hold.
            LOCK.lock();
            try {
                return xPath.evaluate(xPathStr, doc, qName);
            } finally {
                LOCK.unlock();
            }
        } catch (Exception e) {
            log.warn(" analysis Xpath:{}, There is an error , Parsing type :{}, The reason for the error :{}!", xPathStr, qName, e.getMessage());
        }
        return null;
    }

    /**
     * Converts a W3C element back into a jsoup element by serializing it to XML and re-parsing
     * the markup.
     *
     * <p>The markup is parsed with jsoup's XML parser and the root element is returned. The HTML
     * parser is not used because it applies HTML tree-construction rules to the fragment:
     * elements that are only valid in a specific context (for example {@code tr}/{@code td}
     * outside a table) are dropped, and a serialized {@code body} element would yield its first
     * child instead of itself.
     *
     * <p>NOTE(review): the returned element belongs to a document created by the XML parser, so
     * its default output settings may differ from an HTML-parsed document — confirm if callers
     * rely on {@code outerHtml()} formatting.
     *
     * @param elementImpl W3C element to convert; {@code null} yields {@code null}
     * @return the equivalent jsoup element, or {@code null} when the input is null, the markup
     *         has no root element, or serialization/parsing fails
     */
    public static org.jsoup.nodes.Element getJsoupEle(final ElementImpl elementImpl) {
        if (elementImpl == null) {
            return null;
        }
        try {
            final String xml = getW3cDocString(elementImpl);
            final org.jsoup.nodes.Document document =
                    Jsoup.parse(xml, "", org.jsoup.parser.Parser.xmlParser());
            // child(0) indexes element children only, so the leading XML declaration is skipped.
            return document.children().isEmpty() ? null : document.child(0);
        } catch (Exception e) {
            // Passing the exception keeps the stack trace in the log.
            log.error(" according to ElementImpl obtain Jsoup Of Element There is an error , The reason for the error :" + e.getMessage(), e);
            return null;
        }
    }

    /**
     * Converts a W3C document into a jsoup document by serializing it with
     * {@link #getW3cDocString(Node)} and parsing the resulting markup with {@code Jsoup.parse}.
     *
     * @param doc W3C document to convert
     * @return the jsoup document parsed from the serialized markup
     * @throws Exception if serialization fails
     */
    public static org.jsoup.nodes.Document fromW3C(final Document doc) throws Exception {
        return Jsoup.parse(getW3cDocString(doc));
    }

    /**
     * Converts a jsoup element (or document) into a new W3C document.
     *
     * <p>For a jsoup {@code Document} the first element child becomes the root of the W3C
     * document; a document without element children yields an empty W3C document. For any other
     * element, the element itself becomes the root.
     *
     * @param in jsoup element or document to convert; {@code null} yields {@code null}
     * @return the populated W3C document, or {@code null} when the input is null or no
     *         {@code DocumentBuilder} can be created
     */
    public static Node fromJsoup(final org.jsoup.nodes.Element in) {
        if (null == in) {
            return null;
        }
        final Document out;
        try {
            final DocumentBuilder builder = factory.newDocumentBuilder();
            out = builder.newDocument();
        } catch (ParserConfigurationException e) {
            // Previously swallowed silently; callers still receive null but the cause is recorded.
            log.error(" Unable to create a W3C DocumentBuilder", e);
            return null;
        }
        org.jsoup.nodes.Element root = in;
        if (in instanceof org.jsoup.nodes.Document) {
            // children() lists element children only, so a document holding just a doctype or
            // comment is treated as empty instead of failing on child(0).
            final Elements elementChildren = in.children();
            if (elementChildren.isEmpty()) {
                return out;
            }
            root = elementChildren.get(0);
        }
        final NodeTraversor traversor = new NodeTraversor(new W3CBuilder(out));
        traversor.traverse(root);
        return out;
    }

    /**
     * Serializes a W3C node to a string using a transformer created from the shared
     * {@code TransformerFactory}; the transformation runs while holding {@code LOCK}.
     *
     * @param doc W3C node (document or element) to serialize
     * @return the serialized markup
     * @throws IllegalStateException if the transformer cannot be created or the transformation fails
     * @throws Exception             declared because closing the writer may throw
     */
    public static String getW3cDocString(final Node doc) throws Exception {
        try (StringWriter sink = new StringWriter()) {
            LOCK.lock();
            try {
                tf.newTransformer().transform(new DOMSource(doc), new StreamResult(sink));
            } finally {
                LOCK.unlock();
            }
            return sink.toString();
        } catch (TransformerException e) {
            throw new IllegalStateException(e);
        }
    }

    /** Characters that are removed from an attribute name the DOM rejected. */
    private static final java.util.regex.Pattern ATTR_NAME_ILLEGAL_CHARS =
            java.util.regex.Pattern.compile("[^-a-zA-Z0-9_:.]");
    /** Shape a cleaned-up attribute name must have before it is retried. */
    private static final java.util.regex.Pattern ATTR_NAME_VALID =
            java.util.regex.Pattern.compile("[a-zA-Z_:][-a-zA-Z0-9_:.]*");

    /**
     * Copies every attribute of a jsoup node onto a W3C element.
     *
     * <p>Attribute names are passed through unchanged whenever the DOM accepts them. HTML found
     * in the wild can carry names that are not legal XML names; when {@code setAttribute} rejects
     * a name, the illegal characters are stripped and the attribute is set under the cleaned
     * name, or skipped when nothing usable remains. One bad attribute therefore no longer aborts
     * the whole conversion.
     *
     * @param source jsoup node whose attributes are read
     * @param el     W3C element that receives the attributes
     */
    public static void copyAttributes(final org.jsoup.nodes.Node source, final Element el) {
        for (Attribute attribute : source.attributes()) {
            final String key = attribute.getKey();
            final String value = attribute.getValue();
            try {
                el.setAttribute(key, value);
            } catch (DOMException invalidName) {
                final String cleaned = ATTR_NAME_ILLEGAL_CHARS.matcher(key).replaceAll("");
                if (ATTR_NAME_VALID.matcher(cleaned).matches()) {
                    el.setAttribute(cleaned, value);
                }
            }
        }
    }
}

/**
 * jsoup {@code NodeVisitor} that mirrors the visited jsoup nodes into a W3C document.
 *
 * <p>Elements are recreated with their attributes; text and data nodes become W3C text nodes;
 * comments become W3C comments. Any other jsoup node type is ignored.
 */
class W3CBuilder implements NodeVisitor {

    /** Target document; also used as the factory for new W3C nodes. */
    private final Document doc;
    /** W3C element currently receiving children; {@code null} until the first element is visited. */
    private Element cursor;

    public W3CBuilder(Document doc) {
        this.doc = doc;
    }

    /**
     * Called when a jsoup node is entered: creates the matching W3C node and attaches it to the
     * current element (or to the document for the first element visited).
     */
    @Override
    public void head(final org.jsoup.nodes.Node source, int depth) {
        if (source instanceof org.jsoup.nodes.Element) {
            final org.jsoup.nodes.Element jsoupElement = (org.jsoup.nodes.Element) source;
            final Element created = doc.createElement(jsoupElement.tagName());
            JsoupParserUtil.copyAttributes(jsoupElement, created);
            final Node parent = (cursor == null) ? doc : cursor;
            parent.appendChild(created);
            // Descend: subsequent nodes are appended under the element just created.
            cursor = created;
            return;
        }
        if (source instanceof org.jsoup.nodes.TextNode) {
            final String wholeText = ((org.jsoup.nodes.TextNode) source).getWholeText();
            cursor.appendChild(doc.createTextNode(wholeText));
            return;
        }
        if (source instanceof org.jsoup.nodes.Comment) {
            final String data = ((org.jsoup.nodes.Comment) source).getData();
            cursor.appendChild(doc.createComment(data));
            return;
        }
        if (source instanceof org.jsoup.nodes.DataNode) {
            final String wholeData = ((org.jsoup.nodes.DataNode) source).getWholeData();
            cursor.appendChild(doc.createTextNode(wholeData));
        }
    }

    /**
     * Called when a jsoup node is left: after an element, moves the cursor back up to its parent
     * element. The cursor stays on the root element, whose parent is the document.
     */
    @Override
    public void tail(final org.jsoup.nodes.Node source, int depth) {
        if (!(source instanceof org.jsoup.nodes.Element)) {
            return;
        }
        final Node parent = cursor.getParentNode();
        if (parent instanceof Element) {
            cursor = (Element) parent;
        }
    }
}

test

import java.io.IOException;
import java.net.URL;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
 
/**
 * Manual demo for {@code JsoupParserUtil}: downloads a news page and prints the results of a few
 * XPath queries. Requires network access.
 */
public class JsoupParserUtilsTest {

	/**
	 * Fetches the page and prints, in order: whether an XPath that should match nothing exists,
	 * the title text, and the text and {@code class} attribute of the first time span.
	 *
	 * @param args unused
	 * @throws Exception if the page cannot be downloaded or parsed
	 */
	public static void main(String[] args) throws Exception {
		String url = "http://mil.news.sina.com.cn/china/2016-09-29/doc-ifxwmamy9955666.shtml";
		Document doc = Jsoup.parse(new URL(url), 10000);
		String titleXpath = "//*[@id='main_title']/text()";
		String timeXpath = "//*[@id='page-tools']/span/span[position() = 1]";
		// position() is the XPath function; a bare "position" would be read as a child element name.
		System.out.println(JsoupParserUtil.exists(doc, "/html/body/div[position()>1000000]"));
		System.out.println(JsoupParserUtil.getXpathString(doc, titleXpath));
		Element element = JsoupParserUtil.getJsoupElement(doc, timeXpath);
		// getJsoupElement returns null when nothing matches (e.g. the page layout changed).
		if (element == null) {
			System.out.println("No element matched " + timeXpath);
			return;
		}
		System.out.println(element.text());
		System.out.println(element.attr("class"));
	}
}

————————————————
Make your Jsoup support XPath

原网站

版权声明
本文为[God__ is__ a__ girl]所创,转载请带上原文链接,感谢
https://yzsam.com/2022/02/202202211330506757.html