mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-10-05 21:22:39 +02:00
It has long since outgrown the single-responsibility library it once was; as such, it is out of place in its original location and fits better among the function-type modules.
54 lines
1.4 KiB
Java
54 lines
1.4 KiB
Java
package nu.marginalia.dom;
|
|
|
|
import org.jsoup.nodes.Node;
|
|
import org.jsoup.nodes.TextNode;
|
|
import org.jsoup.select.NodeVisitor;
|
|
|
|
/** Best effort visitor to measure the length of the text in a DOM tree
|
|
* without allocating a bunch of Strings.
|
|
*/
|
|
public class MeasureLengthVisitor implements NodeVisitor {
|
|
public int length = 0;
|
|
|
|
@Override
|
|
public void head(Node node, int depth) {
|
|
if (node instanceof TextNode tn) {
|
|
length += lengthOfElement(tn);
|
|
}
|
|
}
|
|
|
|
// Emulate the HTML spec's definition of "length of an element"
|
|
// in a "close-enough" fashion.
|
|
static int lengthOfElement(TextNode tn) {
|
|
String wholeText = tn.getWholeText();
|
|
|
|
int length = 0;
|
|
|
|
int start = 0;
|
|
int end = wholeText.length() - 1;
|
|
|
|
while (start < wholeText.length() && Character.isWhitespace(wholeText.charAt(start)))
|
|
start++;
|
|
while (end >= 0 && Character.isWhitespace(wholeText.charAt(end)))
|
|
end--;
|
|
|
|
boolean lastWasWhitespace = false;
|
|
for (int i = start; i < end; i++) {
|
|
char c = wholeText.charAt(i);
|
|
if (Character.isWhitespace(c)) {
|
|
if (!lastWasWhitespace) {
|
|
length++;
|
|
}
|
|
|
|
lastWasWhitespace = true;
|
|
} else {
|
|
length++;
|
|
|
|
lastWasWhitespace = false;
|
|
}
|
|
}
|
|
|
|
return length;
|
|
}
|
|
}
|