如何快速统计Word文档字符数，包括自动编号的？

2026-04-16 16:37 | 4阅读 | 0评论 | SEO问题

内容介绍
相关推荐

本文共计1308个文字，预计阅读时间需要6分钟。

Maven依赖配置如下：

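```xml
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-scratchpad</artifactId>
    <version>3.14</version>
</dependency>
<dependency>
    <groupId>org.apache.commons</groupId>
    <artifactId>commons-lang3</artifactId>
    <version>3.3.2</version>
</dependency>
```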

maven依赖

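<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-scratchpad</artifactId>
    <version>3.14</version>
</dependency>
<dependency>
    <groupId>org.apache.commons</groupId>
    <artifactId>commons-lang3</artifactId>
    <version>3.3.2</version>
</dependency>

doc文档字符计数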

import org.apache.commons.lang3.ArrayUtils; import org.apache.poi.hwpf.extractor.WordExtractor; import java.io.File; import java.io.FileInputStream; import java.io.InputStream; /** * Created by chunsw@aliyun.com on 16/5/23. */ public class CountDoc { static int[] wordCountNew(String doc, boolean isDebug) throws Exception { long time = System.currentTimeMillis(); InputStream is = new FileInputStream(new File(doc)); WordExtractor ex = new WordExtractor(is); int cnt = 0; StringBuilder builder = new StringBuilder(); for (String text : ex.getParagraphText()) { // text = text.replaceAll("\u0007", "").replaceAll("\f", "") // .replaceAll("\r", "").replaceAll("\n", "") // .replaceAll("\u0015", ""); if (isDebug) { text = trimAllChars(text, new char[] { '\u0007', '\f', '\b', '\u0015' }); } else { text = trimAllChars(text, new char[] { '\u0007', '\f', '\b', '\u0015', '\r', '\n' }); } String prefix = " TOC \\o \\u \u0014"; if (text.startsWith(prefix)) text = text.substring(prefix.length()); // flag = "\u0013 EMBED Visio.Drawing.11 \u0014\u0001"; // flag = "\u0013 EMBED Word.Document.12 \\s \u0014\u0001"; int start = text.indexOf("\u0013"); int end = text.indexOf("\u0014\u0001"); if (start >= 0 && end > start) { text = text.replaceAll("\u0013[^\u0014\u0001]+\u0014\u0001", ""); } text = text.replaceAll("\u0013[^\u0014\u0013]+\u0014", ""); String flag = "\u0013 HYPERLINK"; int pos = text.indexOf(flag); if (pos >= 0) { String[] arr = text.split(" \u0014"); text = text.substring(0, pos) + arr[1]; } if (text.length() >= 767) { // word doc格式时, 如果连续字符数数大于767个(大于等于768), 则该段落的字数不计入 // if (text.replaceAll(" ", "").length() < text.length() - 767) { // text = text.replaceAll(" {767,}", ""); // } } if (isDebug) builder.append(text); cnt += text.length(); } int t = Long.valueOf(System.currentTimeMillis() - time).intValue(); if (isDebug) { System.out.println(builder.toString()); // .replaceAll("\r", "").replaceAll("\n", "") System.out.println(cnt); System.out.println(t + " ms"); } return new 
int[] { cnt, t }; } private static String trimAllChars(String text, char[] chars) { if (text == null || text.isEmpty()) return text; StringBuilder builder = new StringBuilder(); for (int i = 0; i < text.length(); i++) { if (!ArrayUtils.contains(chars, text.charAt(i))) builder.append(text.charAt(i)); } return builder.toString(); } } docx文档字符计数

package com.github.wangshichun.util.word.count;

import org.apache.commons.lang3.StringUtils;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.DefaultHandler;
import org.xml.sax.helpers.XMLReaderFactory;

import java.io.*;
import java.util.HashMap;
import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;

/**
 * Counts the characters of a Word {@code .docx} document by streaming its XML parts through
 * SAX handlers.
 *
 * Created by chunsw@aliyun.com on 16/5/23.
 */
public class CountDocx {

    /**
     * Counts the characters of a document.
     *
     * @param xmlSource either a {@code String} path ending in {@code .docx}, an
     *                  {@link InputStream} over a docx (zip) archive, or any other object whose
     *                  {@code toString()} is a system identifier of a plain document XML file.
     *                  A supplied {@code InputStream} is closed by this method.
     * @param isDebug   when {@code true}, the collected text, the count and the elapsed time are
     *                  printed to standard output
     * @return a two-element array: {@code [characterCount, elapsedMillis]}
     * @throws Exception if the source cannot be read or parsed
     */
    static int[] wordCountNew(Object xmlSource, boolean isDebug) throws Exception {
        long time = System.currentTimeMillis();
        XMLReader parser = XMLReaderFactory.createXMLReader();
        disableExternalEntities(parser);

        // One-element arrays act as mutable counters shared with the handlers.
        final Integer[] cnt = {0};
        final Integer[] sectPrCount = {0};
        final Integer[] brCount = {0};
        final Integer[] numPrCount = {0};
        Map<String, AtomicInteger> numIdMap = new HashMap<>();
        StringBuilder stringBuilder2 = new StringBuilder();

        boolean isDocxPath = xmlSource instanceof String && ((String) xmlSource).endsWith(".docx");
        if (isDocxPath || xmlSource instanceof InputStream) {
            InputStream raw = xmlSource instanceof InputStream
                    ? (InputStream) xmlSource
                    : new FileInputStream((String) xmlSource);
            // try-with-resources closes the archive even when a part fails to parse.
            try (ZipInputStream zipInputStream = new ZipInputStream(raw)) {
                // The parser closes its input when done; the wrapper ignores that so the zip
                // stream stays usable for the following entries.
                NoCloseInputStream entryStream =
                        new NoCloseInputStream(new BufferedInputStream(zipInputStream));
                ZipEntry zipEntry;
                while ((zipEntry = zipInputStream.getNextEntry()) != null) {
                    // The format definitions of bullets and numbering (e.g. level one of a
                    // multilevel list being "%1.", level two "%1.%2") live in
                    // "word/numbering.xml"; they are not processed for now.
                    String entryName = zipEntry.getName();
                    if ("word/document.xml".equals(entryName)) {
                        parser.setContentHandler(new DocumentXMLHandler(cnt, sectPrCount, brCount,
                                numPrCount, numIdMap, stringBuilder2, isDebug));
                        parser.parse(new InputSource(entryStream));
                    } else if ("word/endnotes.xml".equals(entryName)) {
                        parser.setContentHandler(new EndNotesXMLHandler(cnt, stringBuilder2, isDebug));
                        parser.parse(new InputSource(entryStream));
                    }
                }
            }
        } else {
            parser.setContentHandler(new DocumentXMLHandler(cnt, sectPrCount, brCount,
                    numPrCount, numIdMap, stringBuilder2, isDebug));
            parser.parse(xmlSource.toString());
        }

        // Numbered items past the 9th need more than one digit in their label.
        int seqCnt = 0;
        for (AtomicInteger itemCount : numIdMap.values()) {
            seqCnt += extraLabelDigits(itemCount.get());
        }

        int len = cnt[0] + numPrCount[0] - sectPrCount[0] + 1 + brCount[0] + seqCnt;
        int t = (int) (System.currentTimeMillis() - time);
        if (isDebug) {
            System.out.println(stringBuilder2);
            System.out.println(len);
            System.out.println(t + " ms");
        }
        return new int[]{len, t};
    }

    /**
     * Turns off resolution of external entities and external DTDs, so that a crafted document
     * cannot make the parser read local files or remote URLs (XXE). Best effort: a feature the
     * parser does not know is skipped.
     */
    private static void disableExternalEntities(XMLReader parser) {
        String[] features = {
            "http://xml.org/sax/features/external-general-entities",
            "http://xml.org/sax/features/external-parameter-entities",
            "http://apache.org/xml/features/nonvalidating/load-external-dtd"
        };
        for (String feature : features) {
            try {
                parser.setFeature(feature, false);
            } catch (SAXException ignored) {
                // Feature not recognized or not supported by this parser implementation.
            }
        }
    }

    /**
     * Returns how many label digits beyond the first are needed in total when a list is numbered
     * 1..{@code items}: one extra digit for each of 10..99, two for each of 100..999, three for
     * every number from 1000 on.
     */
    private static int extraLabelDigits(int items) {
        if (items < 10) {
            return 0;
        }
        if (items < 100) {
            return items - 9;
        }
        if (items < 1000) {
            return 90 + (items - 99) * 2;
        }
        return 1890 + (items - 999) * 3;
    }

    /** Stream wrapper whose {@link #close()} does nothing; use {@link #doClose()} to really close. */
    static class NoCloseInputStream extends FilterInputStream {
        public NoCloseInputStream(InputStream is) {
            super(is);
        }

        /** Intentionally a no-op so that a consumer cannot close the wrapped stream. */
        @Override
        public void close() throws IOException {
        }

        /** Closes the wrapped stream. */
        public void doClose() throws IOException {
            super.close();
        }
    }

    /** SAX handler that adds the length of every {@code w:t} text run to the shared counter. */
    static class EndNotesXMLHandler extends DefaultHandler {
        private boolean inTextElement = false;
        private final Integer[] cnt;
        private final StringBuilder stringBuilder2;
        private final boolean isDebug;

        EndNotesXMLHandler(Integer[] cnt, StringBuilder stringBuilder2, boolean isDebug) {
            this.cnt = cnt;
            this.stringBuilder2 = stringBuilder2;
            this.isDebug = isDebug;
        }

        @Override
        public void startElement(String uri, String localName, String qName, Attributes atts)
                throws SAXException {
            // Using qualified name because we are not using xmlns prefixes here.
            if (qName.equals("w:t")) {
                inTextElement = true;
            }
        }

        @Override
        public void endElement(String namespaceURI, String localName, String qName)
                throws SAXException {
            if (qName.equals("w:t")) {
                inTextElement = false;
            }
        }

        @Override
        public void characters(char[] ch, int start, int length) {
            if (!inTextElement) {
                return;
            }
            cnt[0] += length;
            if (isDebug) {
                stringBuilder2.append(ch, start, length);
            }
        }
    }

    /**
     * SAX handler for {@code word/document.xml}. It counts text runs and tabs, and tallies
     * section properties, attribute-less breaks and numbering properties into the shared
     * counters.
     */
    static class DocumentXMLHandler extends DefaultHandler {
        private boolean inTabs = false;
        private boolean inNumPr = false;
        private boolean inTextElement = false;
        private boolean hasPStyle = false;
        private boolean inPicture = false;
        private Integer ilvl = null;
        private final Integer[] cnt;
        private final Integer[] sectPrCount;
        private final Integer[] brCount;
        private final Integer[] numPrCount;
        private final Map<String, AtomicInteger> numIdMap;
        private final StringBuilder stringBuilder2;
        private final boolean isDebug;
        /** Text of the current {@code w:t} element; SAX may deliver it in several chunks. */
        private final StringBuilder textBuffer = new StringBuilder();

        DocumentXMLHandler(Integer[] cnt, Integer[] sectPrCount, Integer[] brCount,
                           Integer[] numPrCount, Map<String, AtomicInteger> numIdMap,
                           StringBuilder stringBuilder2, boolean isDebug) {
            this.cnt = cnt;
            this.sectPrCount = sectPrCount;
            this.brCount = brCount;
            this.numPrCount = numPrCount;
            this.numIdMap = numIdMap;
            this.stringBuilder2 = stringBuilder2;
            this.isDebug = isDebug;
            numIdMap.clear();
        }

        @Override
        public void startElement(String uri, String localName, String qName, Attributes atts)
                throws SAXException {
            // Using qualified name because we are not using xmlns prefixes here.
            switch (qName) {
                case "w:tabs":
                    inTabs = true;
                    break;
                case "w:tab":
                    // A w:tab inside w:tabs is skipped; any other w:tab counts as one character.
                    if (!inTabs) {
                        cnt[0]++;
                    }
                    break;
                case "w:sectPr":
                    sectPrCount[0]++;
                    break;
                case "w:br":
                    // Only breaks without attributes are counted.
                    if (atts.getLength() == 0) {
                        brCount[0]++;
                    }
                    break;
                case "w:t":
                    inTextElement = true;
                    textBuffer.setLength(0);
                    break;
                case "w:pStyle":
                    if (StringUtils.isNumeric(atts.getValue("w:val"))) {
                        hasPStyle = true;
                    }
                    break;
                case "w:numPr":
                    inNumPr = true;
                    break;
                case "w:ilvl":
                    if (inNumPr) {
                        String level = atts.getValue("w:val");
                        // A missing or malformed level is skipped instead of aborting the parse.
                        if (StringUtils.isNumeric(level)) {
                            ilvl = Integer.valueOf(level);
                            // Two characters per numbering level
                            // (presumably a digit plus a separator, e.g. "1.1." at level 1).
                            numPrCount[0] += (ilvl + 1) * 2;
                        }
                    }
                    break;
                case "w:numId":
                    if (inNumPr && hasPStyle) {
                        String key = atts.getValue("w:val") + "_" + ilvl;
                        numIdMap.computeIfAbsent(key, k -> new AtomicInteger(0)).incrementAndGet();
                    }
                    break;
                case "w:pict":
                    inPicture = true;
                    break;
                default:
                    break;
            }
        }

        @Override
        public void endElement(String namespaceURI, String localName, String qName)
                throws SAXException {
            switch (qName) {
                case "w:tabs":
                    inTabs = false;
                    break;
                case "w:pPr":
                    hasPStyle = false;
                    break;
                case "w:numPr":
                    inNumPr = false;
                    ilvl = null;
                    break;
                case "w:t":
                    inTextElement = false;
                    flushText();
                    break;
                case "w:pict":
                    inPicture = false;
                    break;
                default:
                    break;
            }
        }

        @Override
        public void characters(char[] ch, int start, int length) {
            if (!inTextElement || inPicture) {
                return;
            }
            // Only buffer here: the parser may split one text node across several calls, so the
            // long-run rule below has to see the complete element text.
            textBuffer.append(ch, start, length);
        }

        /** Counts the buffered text of the element that just ended and clears the buffer. */
        private void flushText() {
            if (textBuffer.length() == 0) {
                return;
            }
            String text = textBuffer.toString();
            textBuffer.setLength(0);
            if (text.length() >= 767) {
                // Original author's note: in the Word doc format, a run of more than 767
                // (i.e. at least 768) consecutive characters is not counted. The code drops
                // every run of 767 or more spaces.
                text = text.replaceAll(" {767,}", "");
            }
            cnt[0] += text.length();
            if (isDebug) {
                stringBuilder2.append(text);
            }
        }

        @Override
        public void ignorableWhitespace(char ch[], int start, int length) throws SAXException {
        }
    }
}