rjc*_*rjc 7 screen-scraping jsoup
假设我有以下html:
<html>
<head>
</head>
<body>
<div id="wrapper" >
<div class="s2">I am going <a title="some title" href="">by flying</a>
<p>mr tt</p>
</div>
</div>
</body>
</html>
Run Code Online (Sandbox Code Playgroud)
文本节点中任何长度大于或等于4个字符的单词(例如单词"going"),都应在原始html中被替换为html内容(而不是文本)<span>going</span>,并且不更改任何其他内容.
如果我尝试使用类似 element.html(replacement) 的方法,问题在于:如果当前元素是 <div class="s2">,它也会把其中的 <a title="some title"> 一并擦除.
Mar*_*coS 12
在这种情况下,您必须按照此答案的建议遍历您的文档.以下是使用Jsoup API执行此操作的方法:
NodeTraversor 和 NodeVisitor 允许您遍历 DOM;Node.replaceWith(...) 允许替换 DOM 中的节点.这是代码:
/**
 * Demo: wraps every word of at least {@link #MIN_WORD_LENGTH} characters that
 * occurs in a text node of a parsed HTML document in {@code <span>} tags,
 * while leaving the surrounding elements and their attributes untouched.
 *
 * <p>The document is first traversed to collect the affected text nodes, and
 * only afterwards are those nodes replaced, so the tree is never modified
 * while it is being walked.
 */
public class JsoupReplacer {

    /** Minimum word length (inclusive) that gets wrapped: "equal or greater than 4 characters". */
    private static final int MIN_WORD_LENGTH = 4;

    /**
     * A word is a maximal run of non-whitespace characters. Fully qualified so
     * this class does not depend on an additional import line.
     */
    private static final java.util.regex.Pattern WORD =
            java.util.regex.Pattern.compile("\\S+");

    public static void main(String[] args) {
        so6527876();
    }

    /**
     * Parses the sample HTML, wraps the long words found in its body's text
     * nodes and prints the resulting document to standard output.
     */
    public static void so6527876() {
        String html =
                "<html>" +
                "<head>" +
                "</head>" +
                "<body>" +
                " <div id=\"wrapper\" >" +
                " <div class=\"s2\">I am going <a title=\"some title\" href=\"\">by flying</a>" +
                " <p>mr tt</p>" +
                " </div> " +
                " </div>" +
                "</body> " +
                "</html>";
        Document doc = Jsoup.parse(html);

        // Pass 1: only collect; replacing during traversal would mutate the
        // tree that is currently being walked.
        final List<TextNode> nodesToChange = new ArrayList<TextNode>();
        NodeTraversor nd = new NodeTraversor(new NodeVisitor() {
            @Override
            public void tail(Node node, int depth) {
                if (node instanceof TextNode) {
                    TextNode textNode = (TextNode) node;
                    if (containsLongWord(textNode.getWholeText())) {
                        nodesToChange.add(textNode);
                    }
                }
            }

            @Override
            public void head(Node node, int depth) {
                // Nothing to do on entry; all work happens in tail().
            }
        });
        nd.traverse(doc.body());

        // Pass 2: swap each collected text node for its marked-up counterpart.
        for (TextNode textNode : nodesToChange) {
            Node newNode = buildElementForText(textNode);
            textNode.replaceWith(newNode);
        }

        System.out.println("result: ");
        System.out.println();
        System.out.println(doc);
    }

    /**
     * Builds the replacement node for a text node.
     *
     * <p>A {@code DataNode} is used so that the generated {@code <span>} tags
     * are written out as markup instead of being escaped as text. Because of
     * that, the original text itself is HTML-escaped by
     * {@link #wrapLongWords(String)} before it is placed in the node.
     *
     * @param textNode the text node whose long words should be wrapped
     * @return a node carrying the marked-up text, with the same base URI
     */
    private static Node buildElementForText(TextNode textNode) {
        return new DataNode(wrapLongWords(textNode.getWholeText()), textNode.baseUri());
    }

    /**
     * Reports whether the text contains at least one word that qualifies for wrapping.
     *
     * @param text raw text of a text node
     * @return {@code true} if some word has {@link #MIN_WORD_LENGTH} or more characters
     */
    private static boolean containsLongWord(String text) {
        java.util.regex.Matcher words = WORD.matcher(text);
        while (words.find()) {
            if (words.end() - words.start() >= MIN_WORD_LENGTH) {
                return true;
            }
        }
        return false;
    }

    /**
     * Converts plain text to HTML markup in which every long word is enclosed
     * in {@code <span>...</span>}.
     *
     * <p>The text is processed in a single left-to-right pass, word by word.
     * This avoids interpreting words as regular expressions and avoids
     * wrapping a word twice when it also occurs inside a longer word.
     * Whitespace between words is copied through unchanged.
     *
     * @param text raw (unescaped) text of a text node
     * @return HTML markup equivalent to the text, with long words wrapped
     */
    private static String wrapLongWords(String text) {
        StringBuilder markup = new StringBuilder(text.length() + 16);
        java.util.regex.Matcher words = WORD.matcher(text);
        int copiedUpTo = 0;
        while (words.find()) {
            // Copy the whitespace preceding this word verbatim.
            markup.append(text, copiedUpTo, words.start());
            String word = words.group();
            // Length is measured on the raw word, before escaping.
            if (word.length() >= MIN_WORD_LENGTH) {
                markup.append("<span>").append(escapeHtml(word)).append("</span>");
            } else {
                markup.append(escapeHtml(word));
            }
            copiedUpTo = words.end();
        }
        markup.append(text, copiedUpTo, text.length());
        return markup.toString();
    }

    /**
     * Escapes the characters that would otherwise be interpreted as markup
     * when the text is emitted unescaped.
     *
     * @param s raw text
     * @return the text with {@code &}, {@code <} and {@code >} replaced by entities
     */
    private static String escapeHtml(String s) {
        // '&' must be handled first so the entities added below are not re-escaped.
        return s.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;");
    }
}
Run Code Online (Sandbox Code Playgroud)