当前位置:网站首页>Jsoup supports XPath
Jsoup supports XPath
2022-06-24 19:34:00 【God__ is__ a__ girl】
XPath is a specialized query language for structured XML documents, with a powerful syntax. This article does not cover XPath syntax itself.
jsoup is a Java HTML parser that can directly parse a URL or a piece of HTML text. It provides a very convenient API for extracting and manipulating data through DOM traversal, CSS selectors, and jQuery-like methods. Even so, selecting an element this way is still not as simple and direct as using XPath, and there are many XPath libraries to choose from.
Unfortunately, jsoup does not support XPath, so I wrote a utility class that lets jsoup work with XPath. I hope it helps anyone who needs it!
Utility class
package com.ry.mytools.util;
import com.sun.org.apache.xerces.internal.dom.ElementImpl;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Attribute;
import org.jsoup.select.Elements;
import org.jsoup.select.NodeTraversor;
import org.jsoup.select.NodeVisitor;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.*;
import javax.xml.namespace.QName;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathFactory;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReentrantLock;
/**
 * XPath helper for jsoup.
 *
 * <p>jsoup has no XPath engine of its own, so every query here first copies the
 * jsoup tree into a W3C DOM document (see {@link #fromJsoup}) and then evaluates
 * the expression with the JDK's {@code javax.xml.xpath} implementation. Matched
 * W3C elements are turned back into jsoup elements by serializing them and
 * re-parsing the markup with jsoup (see {@link #getJsoupEle}), so the returned
 * jsoup elements are detached copies, not nodes of the original document.
 *
 * <p>The shared {@code XPath}, {@code TransformerFactory} and
 * {@code DocumentBuilderFactory} instances are only used while holding one
 * class-wide lock.
 *
 * <p>NOTE(review): {@code ElementImpl} is a JDK-internal Xerces class; on modular
 * JDKs it presumably needs the package to be exported to compile and run - confirm.
 *
 * @author liuhh
 */
@SuppressWarnings("restriction")
public class JsoupParserUtil {
    protected final static DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
    private final static Logger log = LoggerFactory.getLogger(JsoupParserUtil.class);
    private final static XPath xPath = XPathFactory.newInstance().newXPath();
    protected static TransformerFactory tf = TransformerFactory.newInstance();
    private static final Lock LOCK = new ReentrantLock();

    /** Characters that may not appear anywhere in an attribute name copied to the W3C DOM. */
    private static final java.util.regex.Pattern INVALID_ATTR_CHARS =
            java.util.regex.Pattern.compile("[^-a-zA-Z0-9_:.]");
    /** Shape an attribute name must have (after stripping) to be copied to the W3C DOM. */
    private static final java.util.regex.Pattern VALID_ATTR_NAME =
            java.util.regex.Pattern.compile("[a-zA-Z_:][-a-zA-Z0-9_:.]*");

    /**
     * Counts the nodes selected by an XPath expression.
     *
     * @param ele   jsoup element (or document) to query
     * @param xpath XPath expression that yields a node-set
     * @return number of matched nodes; {@code 0} when nothing matches or evaluation fails
     */
    public static int getEleChildNum(final org.jsoup.nodes.Element ele, final String xpath) {
        try {
            Object res = parse(ele, xpath, XPathConstants.NODESET);
            if (res instanceof NodeList) {
                return ((NodeList) res).getLength();
            }
        } catch (Exception e) {
            log.error(" according to xpath:{}, Error getting the number of child nodes , The reason for the error :{}",
                    xpath, e.getMessage(), e);
        }
        return 0;
    }

    /**
     * Evaluates an XPath expression as a boolean, i.e. checks whether it matches anything.
     *
     * @param ele   jsoup element (or document) to query
     * @param xpath XPath expression
     * @return the boolean result; {@code false} when evaluation fails
     */
    public static boolean exists(final org.jsoup.nodes.Element ele, final String xpath) {
        try {
            Object res = parse(ele, xpath, XPathConstants.BOOLEAN);
            return res instanceof Boolean && (Boolean) res;
        } catch (Exception e) {
            log.error(" Check xpath:{}, There was an error while checking whether it exists ,!{}",
                    xpath, e.getMessage(), e);
        }
        return false;
    }

    /**
     * Returns the first W3C element selected by an XPath expression.
     *
     * @param ele   jsoup element (or document) to query
     * @param xpath XPath expression that selects an element
     * @return the matched element, or {@code null} when there is no element match or evaluation fails
     */
    public static ElementImpl getW3cElementImpl(final org.jsoup.nodes.Element ele, final String xpath) {
        try {
            Object res = parse(ele, xpath, XPathConstants.NODE);
            if (res instanceof ElementImpl) {
                return (ElementImpl) res;
            }
        } catch (Exception e) {
            log.error(" according to xpath:{}, obtain w3c Of Element Object has an error , reason :{}",
                    xpath, e.getMessage(), e);
        }
        return null;
    }

    /**
     * Returns the first element selected by an XPath expression as a jsoup element.
     *
     * @param ele   jsoup element (or document) to query
     * @param xpath XPath expression that selects an element
     * @return a re-parsed copy of the matched element, or {@code null} when there is no
     *         element match or anything fails
     */
    public static org.jsoup.nodes.Element getJsoupElement(final org.jsoup.nodes.Element ele, final String xpath) {
        try {
            Object res = parse(ele, xpath, XPathConstants.NODE);
            if (res instanceof ElementImpl) {
                return getJsoupEle((ElementImpl) res);
            }
        } catch (Exception e) {
            log.error(" according to xpath:{}, obtain jsoup Of Element Object has an error , reason :{}",
                    xpath, e.getMessage(), e);
        }
        return null;
    }

    /**
     * Returns every element selected by an XPath expression as jsoup elements.
     *
     * @param ele   jsoup element (or document) to query
     * @param xpath XPath expression that yields a node-set
     * @return re-parsed copies of the matched elements (non-element nodes and elements that
     *         could not be converted are left out), or {@code null} when the node-set is
     *         empty or evaluation fails
     */
    public static Elements getJsoupElements(final org.jsoup.nodes.Element ele, final String xpath) {
        try {
            NodeList nodeList = getNodeList(ele, xpath);
            if (null != nodeList && nodeList.getLength() > 0) {
                int len = nodeList.getLength();
                Elements elements = new Elements();
                for (int i = 0; i < len; i++) {
                    Node node = nodeList.item(i);
                    if (node instanceof ElementImpl) {
                        org.jsoup.nodes.Element element = getJsoupEle((ElementImpl) node);
                        // getJsoupEle returns null on failure; never hand callers a null entry.
                        if (null != element) {
                            elements.add(element);
                        }
                    }
                }
                return elements;
            }
        } catch (Exception e) {
            log.error(" according to xpath:{}, obtain jsoup Of Element Object has an error , reason :{}",
                    xpath, e.getMessage(), e);
        }
        return null;
    }

    /**
     * Evaluates an XPath expression against a jsoup element and returns the W3C node list.
     *
     * @param ele   jsoup element (or document) to query
     * @param xpath XPath expression that yields a node-set
     * @return the matched nodes, or {@code null} when evaluation fails
     */
    public static NodeList getNodeList(final org.jsoup.nodes.Element ele, final String xpath) {
        try {
            Object res = parse(ele, xpath, XPathConstants.NODESET);
            if (res instanceof NodeList) {
                return (NodeList) res;
            }
        } catch (Exception e) {
            log.error(e.getMessage(), e);
        }
        return null;
    }

    /**
     * Returns the text selected by an XPath expression (typically {@code .../text()} or
     * {@code .../@attr}).
     *
     * <p>When exactly one node matches, the XPath string value of the expression is returned.
     * When several match, their non-null node values are concatenated, with every
     * {@code "\r\n"} replaced by {@code "."}.
     *
     * @param ele   jsoup element (or document) to query
     * @param xpath XPath expression that yields a node-set
     * @return the selected text, or {@code null} when nothing matches or evaluation fails
     */
    public static String getXpathString(final org.jsoup.nodes.Element ele, final String xpath) {
        try {
            // Convert the jsoup tree once and reuse it for both evaluations below.
            Node w3cNode = fromJsoup(ele);
            Object matched = parse(w3cNode, xpath, XPathConstants.NODESET);
            if (!(matched instanceof NodeList)) {
                return null;
            }
            NodeList nodeList = (NodeList) matched;
            int length = nodeList.getLength();
            if (length <= 0) {
                return null;
            }
            if (1 == length) {
                Object res = parse(w3cNode, xpath, XPathConstants.STRING);
                return null == res ? null : res.toString();
            }
            StringBuilder stringBuilder = new StringBuilder();
            for (int i = 0; i < length; i++) {
                Node node = nodeList.item(i);
                String text = null == node ? null : node.getNodeValue();
                if (null != text) {
                    stringBuilder.append(text.replace("\r\n", "."));
                }
            }
            return stringBuilder.toString();
        } catch (Exception e) {
            log.error(" according to xpath:{} Error querying string :{}", xpath, e.getMessage(), e);
        }
        return null;
    }

    /**
     * Returns the node value of every node selected by an XPath expression.
     *
     * @param ele   jsoup element (or document) to query
     * @param xpath XPath expression that yields a node-set
     * @return one entry per matched node (an entry is {@code null} when the node has no node
     *         value), or {@code null} when nothing matches or evaluation fails
     */
    public static List<String> getXpathListString(final org.jsoup.nodes.Element ele, final String xpath) {
        try {
            Object res = parse(ele, xpath, XPathConstants.NODESET);
            if (res instanceof NodeList) {
                NodeList nodeList = (NodeList) res;
                int length = nodeList.getLength();
                if (length <= 0) {
                    return null;
                }
                List<String> list = new ArrayList<>(length);
                for (int i = 0; i < length; i++) {
                    Node node = nodeList.item(i);
                    list.add(null == node ? null : node.getNodeValue());
                }
                return list;
            }
        } catch (Exception e) {
            log.error(" according to xpath:{} Error querying string list :{}", xpath, e.getMessage(), e);
        }
        return null;
    }

    /**
     * Converts a jsoup element to a W3C DOM and evaluates an XPath expression against it.
     *
     * @param doc      jsoup element (or document) to query
     * @param xPathStr XPath expression
     * @param qName    requested result type, one of the {@link XPathConstants}
     * @return the evaluation result, or {@code null} when an argument is missing or evaluation fails
     */
    public static Object parse(final org.jsoup.nodes.Element doc, final String xPathStr, final QName qName) {
        Node node = fromJsoup(doc);
        return parse(node, xPathStr, qName);
    }

    /**
     * Evaluates an XPath expression against a W3C node.
     *
     * @param doc      context node
     * @param xPathStr XPath expression
     * @param qName    requested result type, one of the {@link XPathConstants}
     * @return the evaluation result, or {@code null} when an argument is missing or evaluation fails
     */
    public static Object parse(final Node doc, final String xPathStr, final QName qName) {
        if (doc == null) {
            log.warn(" Parse document as null!");
            return null;
        }
        if (StringUtils.isBlank(xPathStr)) {
            log.warn(" Analytic Xpath The path is empty. !");
            return null;
        }
        if (null == qName) {
            log.warn(" The parsing type is null!");
            return null;
        }
        try {
            // Acquire the lock before entering the guarded try so that unlock() in the
            // finally block only runs when the lock is actually held.
            LOCK.lock();
            try {
                return xPath.evaluate(xPathStr, doc, qName);
            } finally {
                LOCK.unlock();
            }
        } catch (Exception e) {
            log.warn(" analysis Xpath:{}, There is an error , Parsing type :{}, The reason for the error :{}!", xPathStr, qName, e.getMessage());
        }
        return null;
    }

    /**
     * Converts a W3C element into a jsoup element by serializing it and re-parsing the markup.
     *
     * @param elementImpl W3C element to convert
     * @return the first child of the re-parsed document's body, or {@code null} on any failure
     */
    public static org.jsoup.nodes.Element getJsoupEle(final ElementImpl elementImpl) {
        try {
            String value = getW3cDocString(elementImpl);
            org.jsoup.nodes.Document document = Jsoup.parse(value);
            return document.body().child(0);
        } catch (Exception e) {
            log.error(" according to ElementImpl obtain Jsoup Of Element There is an error , The reason for the error :{}",
                    e.getMessage(), e);
            return null;
        }
    }

    /**
     * Converts a W3C document into a jsoup document by serializing it and re-parsing the markup.
     *
     * @param doc W3C document
     * @return the jsoup document
     * @throws Exception when serialization fails
     */
    public static org.jsoup.nodes.Document fromW3C(final Document doc) throws Exception {
        String string = getW3cDocString(doc);
        return Jsoup.parse(string);
    }

    /**
     * Copies a jsoup element (or document) into a new W3C DOM document.
     *
     * <p>For a jsoup {@code Document} the copy starts at its first child element; a document
     * without any child element produces an empty W3C document.
     *
     * @param in jsoup element or document
     * @return the W3C document, or {@code null} when {@code in} is {@code null} or no
     *         document builder could be created
     */
    public static Node fromJsoup(final org.jsoup.nodes.Element in) {
        if (null == in) {
            return null;
        }
        final Document out;
        try {
            out = newW3cDocument();
        } catch (ParserConfigurationException e) {
            log.error("Failed to create a W3C DocumentBuilder", e);
            return null;
        }
        org.jsoup.nodes.Element root = in;
        if (in instanceof org.jsoup.nodes.Document) {
            // childNodes() may contain only non-element nodes (doctype, comment, text), in
            // which case child(0) would throw; look at element children instead.
            if (in.children().isEmpty()) {
                return out;
            }
            root = in.child(0);
        }
        NodeTraversor traversor = new NodeTraversor(new W3CBuilder(out));
        traversor.traverse(root);
        return out;
    }

    /** Creates an empty W3C document; the shared factory is only touched under the lock. */
    private static Document newW3cDocument() throws ParserConfigurationException {
        LOCK.lock();
        try {
            DocumentBuilder builder = factory.newDocumentBuilder();
            return builder.newDocument();
        } finally {
            LOCK.unlock();
        }
    }

    /**
     * Serializes a W3C node to a string with a default {@link Transformer}.
     *
     * @param doc node to serialize
     * @return the serialized markup
     * @throws IllegalStateException when the transformation fails
     * @throws Exception             when closing the underlying writer fails
     */
    public static String getW3cDocString(final Node doc) throws Exception {
        try (StringWriter writer = new StringWriter()) {
            DOMSource domSource = new DOMSource(doc);
            StreamResult result = new StreamResult(writer);
            LOCK.lock();
            try {
                Transformer transformer = tf.newTransformer();
                transformer.transform(domSource, result);
                return writer.toString();
            } finally {
                LOCK.unlock();
            }
        } catch (TransformerException e) {
            throw new IllegalStateException(e);
        }
    }

    /**
     * Copies the attributes of a jsoup node onto a W3C element.
     *
     * <p>HTML accepts attribute names that are not legal XML names, and
     * {@code Element.setAttribute} rejects those with a {@code DOMException}. To keep one odd
     * attribute from aborting the whole conversion, characters outside
     * {@code [-a-zA-Z0-9_:.]} are stripped from the name, and the attribute is skipped when
     * what remains is still not a usable name. Ordinary attribute names are copied unchanged.
     *
     * @param source jsoup node whose attributes are read
     * @param el     W3C element that receives the attributes
     */
    public static void copyAttributes(final org.jsoup.nodes.Node source, final Element el) {
        for (Attribute attribute : source.attributes()) {
            String key = INVALID_ATTR_CHARS.matcher(attribute.getKey()).replaceAll("");
            if (VALID_ATTR_NAME.matcher(key).matches()) {
                el.setAttribute(key, attribute.getValue());
            }
        }
    }
}
/**
 * jsoup {@code NodeVisitor} that mirrors the visited jsoup nodes into a W3C DOM document.
 *
 * <p>Elements are recreated with their attributes; text nodes and data nodes become W3C text
 * nodes; comments become W3C comments. Any other node type is ignored. {@code dest} tracks
 * the W3C element that currently receives children.
 */
class W3CBuilder implements NodeVisitor {
    private final Document doc;
    private Element dest;

    public W3CBuilder(Document doc) {
        this.doc = doc;
    }

    /**
     * Called when a jsoup node is entered: creates the matching W3C node and attaches it.
     *
     * @param source jsoup node being visited
     * @param depth  depth of the node in the traversal (unused)
     */
    @Override
    public void head(final org.jsoup.nodes.Node source, int depth) {
        if (source instanceof org.jsoup.nodes.Element) {
            openElement((org.jsoup.nodes.Element) source);
            return;
        }
        if (source instanceof org.jsoup.nodes.TextNode) {
            String wholeText = ((org.jsoup.nodes.TextNode) source).getWholeText();
            Text textNode = doc.createTextNode(wholeText);
            dest.appendChild(textNode);
            return;
        }
        if (source instanceof org.jsoup.nodes.Comment) {
            String commentData = ((org.jsoup.nodes.Comment) source).getData();
            Comment commentNode = doc.createComment(commentData);
            dest.appendChild(commentNode);
            return;
        }
        if (source instanceof org.jsoup.nodes.DataNode) {
            // Data nodes are copied as plain text.
            String wholeData = ((org.jsoup.nodes.DataNode) source).getWholeData();
            Text dataAsText = doc.createTextNode(wholeData);
            dest.appendChild(dataAsText);
        }
        // Every other node type is skipped.
    }

    /**
     * Called when a jsoup node is left: after an element, moves {@code dest} back up to its
     * parent, provided that parent is itself an element.
     *
     * @param source jsoup node being left
     * @param depth  depth of the node in the traversal (unused)
     */
    @Override
    public void tail(final org.jsoup.nodes.Node source, int depth) {
        if (!(source instanceof org.jsoup.nodes.Element)) {
            return;
        }
        org.w3c.dom.Node parent = dest.getParentNode();
        if (parent instanceof Element) {
            dest = (Element) parent;
        }
    }

    /** Creates the W3C counterpart of a jsoup element, attaches it, and makes it the current target. */
    private void openElement(final org.jsoup.nodes.Element sourceEl) {
        Element created = doc.createElement(sourceEl.tagName());
        JsoupParserUtil.copyAttributes(sourceEl, created);
        // The very first element becomes the document root; later ones nest under dest.
        org.w3c.dom.Node parent = (dest == null) ? doc : dest;
        parent.appendChild(created);
        dest = created;
    }
}
test
import java.io.IOException;
import java.net.URL;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
/**
 * Small manual demo of {@code JsoupParserUtil}: downloads a page and runs a few XPath
 * queries against it, printing the results.
 */
public class JsoupParserUtilsTest {
    /**
     * Fetches the sample page and prints the results of three XPath queries.
     *
     * @param args unused
     * @throws IOException when the page cannot be downloaded
     */
    public static void main(String[] args) throws IOException {
        String url = "http://mil.news.sina.com.cn/china/2016-09-29/doc-ifxwmamy9955666.shtml";
        Document doc = Jsoup.parse(new URL(url), 10000);
        String titleXpath = "//*[@id='main_title']/text()";
        String timeXpath = "//*[@id='page-tools']/span/span[position() = 1]";
        // position() is a function call; a bare "position" would test a child element of that name.
        System.out.println(JsoupParserUtil.exists(doc, "/html/body/div[position()>1000000]"));
        System.out.println(JsoupParserUtil.getXpathString(doc, titleXpath));
        Element element = JsoupParserUtil.getJsoupElement(doc, timeXpath);
        // getJsoupElement returns null when nothing matches (e.g. the page layout changed).
        if (element == null) {
            System.out.println("No element matched: " + timeXpath);
            return;
        }
        System.out.println(element.text());
        System.out.println(element.attr("class"));
    }
}
————————————————
Let your Jsoup Support Xpath
边栏推荐
- 实时渲染:实时、离线、云渲染、混合渲染的区别
- 論文解讀(SR-GNN)《Shift-Robust GNNs: Overcoming the Limitations of Localized Graph Training Data》
- Starring develops httpjson access point + Database
- STM32 uses time delay to realize breathing lamp register version
- 优维低代码:构件渲染子构件
- Unity移动端游戏性能优化简谱之 以引擎模块为划分的CPU耗时调优
- Interprétation de la thèse (SR - gnn) Shift Robust GNNS: Overcoming the Limits of Localized Graph Training Data
- 全链路业务追踪落地实践方案
- Intel and Microsoft give full play to the potential energy of edge cloud collaboration to promote the large-scale deployment of AI
- How to select the ECS type and what to consider?
猜你喜欢

工作6年,月薪3W,1名PM的奋斗史

全链路业务追踪落地实践方案

Data backup and recovery of PgSQL

Application practice | massive data, second level analysis! Flink+doris build a real-time data warehouse scheme

Apifox与其他接口开发工具的博弈

Starring V6 platform development take out point process

Working for 6 years with a monthly salary of 3W and a history of striving for one PM

一文理解OpenStack网络

多云模式并非“万能钥匙”

Using alicloud RDS for SQL Server Performance insight to optimize database load - first understanding of performance insight
随机推荐
The agile way? Is agile development really out of date?
Ask a question. Adbhi supports the retention of 100 databases with the latest IDs. Is this an operation like this
Apifox与其他接口开发工具的博弈
【计算讲谈社】第三讲:如何提出关键问题?
Does version 2.2.0 support dynamic addition of MySQL synchronization tables
Pingcap was selected as the "voice of customers" of Gartner cloud database in 2022, and won the highest score of "outstanding performer"
Fabric ledger data block structure analysis (I): how to analyze the smart contract transaction data in the ledger
请问一下2.2.0版本支持动态新增mysql同步表吗
Introduction to smart contract security audit delegatecall (2)
一文理解OpenStack网络
怎么使用R包ggtreeExtra绘制进化树
Redis error: -bash: redis cli: command not found
Volcano becomes spark default batch scheduler
Necessary fault handling system for enterprise network administrator
Freeswitch uses origin to dialplan
Example analysis of corrplot related heat map beautification in R language
The script implements the automated deployment of raid0
小滴课堂海量数据处理商用短链平台大课
What are the functions of IBPs open source form designer?
Game between apifox and other interface development tools