1
1
mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git synced 2025-10-05 21:22:39 +02:00
Files
MarginaliaSearch/code/functions/language-processing/java/nu/marginalia/dom/MeasureLengthVisitor.java
Viktor Lofgren c661ebb619 (refac) Move language-processing into functions
It's long surpassed the single-responsibility library it once was, and is as such out of place in its original location, and fits better among the function-type modules.
2025-09-18 10:30:40 +02:00

54 lines
1.4 KiB
Java

package nu.marginalia.dom;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
import org.jsoup.select.NodeVisitor;
/** Best effort visitor to measure the length of the text in a DOM tree
 * without allocating a bunch of Strings.
 * <p>
 * Each {@link TextNode} passed to {@link #head(Node, int)} contributes its
 * whitespace-normalized length (see {@link #lengthOfElement(TextNode)}) to
 * the running total in {@link #length}.
 */
public class MeasureLengthVisitor implements NodeVisitor {
    /** Running total of the measured lengths of all text nodes visited so far. */
    public int length = 0;

    /**
     * Adds the measured length of {@code node} to {@link #length} when it is a
     * {@link TextNode}; every other node type is ignored.
     *
     * @param node  the node being visited
     * @param depth the depth of the node, unused here
     */
    @Override
    public void head(Node node, int depth) {
        if (node instanceof TextNode tn) {
            length += lengthOfElement(tn);
        }
    }

    /**
     * Emulates the HTML spec's definition of "length of an element"
     * in a "close-enough" fashion.
     * <p>
     * Leading and trailing whitespace is not counted, and every run of
     * consecutive whitespace characters inside the text counts as a single
     * character. Whitespace is decided by {@link Character#isWhitespace(char)}.
     *
     * @param tn the text node to measure
     * @return the number of characters after trimming and collapsing whitespace;
     *         0 for empty or all-whitespace text
     */
    static int lengthOfElement(TextNode tn) {
        String wholeText = tn.getWholeText();

        int length = 0;

        int start = 0;
        int end = wholeText.length() - 1;

        // Skip leading whitespace; start ends up on the first non-whitespace
        // character, or at wholeText.length() if there is none.
        while (start < wholeText.length() && Character.isWhitespace(wholeText.charAt(start)))
            start++;

        // Skip trailing whitespace; end ends up on the last non-whitespace
        // character (inclusive), or at -1 if there is none.
        while (end >= 0 && Character.isWhitespace(wholeText.charAt(end)))
            end--;

        boolean lastWasWhitespace = false;

        // end is an inclusive index, so the bound must be <= to count the
        // final non-whitespace character as well.
        for (int i = start; i <= end; i++) {
            char c = wholeText.charAt(i);

            if (Character.isWhitespace(c)) {
                // Only the first character of a whitespace run is counted
                if (!lastWasWhitespace) {
                    length++;
                }
                lastWasWhitespace = true;
            } else {
                length++;
                lastWasWhitespace = false;
            }
        }

        return length;
    }
}