ttml to srt conversion

rewrite SubtitleConverter (use JSoup library instead, remove unused methods)
This commit is contained in:
kapodamy 2020-01-14 00:00:45 -03:00
parent 9f47a274a8
commit 42ec6f0810
4 changed files with 100 additions and 397 deletions

View file

@ -832,7 +832,6 @@ public class DownloadDialog extends DialogFragment implements RadioGroup.OnCheck
psArgs = new String[]{
selectedStream.getFormat().getSuffix(),
"false",// ignore empty frames
"false",// detect youtube duplicate lines
};
}
break;

View file

@ -0,0 +1,95 @@
package org.schabi.newpipe.streams;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
import org.jsoup.parser.Parser;
import org.jsoup.select.Elements;
import org.schabi.newpipe.streams.io.SharpStream;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.nio.charset.Charset;
import java.text.ParseException;
/**
* @author kapodamy
*/
public class SrtFromTtmlWriter {
private static final String NEW_LINE = "\r\n";
private SharpStream out;
private boolean ignoreEmptyFrames;
private final Charset charset = Charset.forName("utf-8");
private int frameIndex = 0;
public SrtFromTtmlWriter(SharpStream out, boolean ignoreEmptyFrames) {
this.out = out;
this.ignoreEmptyFrames = true;
}
private static String getTimestamp(Element frame, String attr) {
return frame
.attr(attr)
.replace('.', ',');// Str uses comma as decimal separator
}
private void writeFrame(String begin, String end, StringBuilder text) throws IOException {
writeString(String.valueOf(frameIndex++));
writeString(NEW_LINE);
writeString(begin);
writeString(" --> ");
writeString(end);
writeString(NEW_LINE);
writeString(text.toString());
writeString(NEW_LINE);
writeString(NEW_LINE);
}
private void writeString(String text) throws IOException {
out.write(text.getBytes(charset));
}
public void build(SharpStream ttml) throws IOException {
/*
* TTML parser with BASIC support
* multiple CUE is not supported
* styling is not supported
* tag timestamps (in auto-generated subtitles) are not supported, maybe in the future
* also TimestampTagOption enum is not applicable
* Language parsing is not supported
*/
// parse XML
byte[] buffer = new byte[(int) ttml.available()];
ttml.read(buffer);
Document doc = Jsoup.parse(new ByteArrayInputStream(buffer), "UTF-8", "", Parser.xmlParser());
StringBuilder text = new StringBuilder(128);
Elements paragraph_list = doc.select("body>div>p");
// check if has frames
if (paragraph_list.size() < 1) return;
for (Element paragraph : paragraph_list) {
text.setLength(0);
for (Node children : paragraph.childNodes()) {
if (children instanceof TextNode)
text.append(((TextNode) children).text());
else if (children instanceof Element && ((Element) children).tagName().equalsIgnoreCase("br"))
text.append(NEW_LINE);
}
if (ignoreEmptyFrames && text.length() < 1) continue;
String begin = getTimestamp(paragraph, "begin");
String end = getTimestamp(paragraph, "end");
writeFrame(begin, end, text);
}
}
}

View file

@ -1,369 +0,0 @@
package org.schabi.newpipe.streams;
import org.schabi.newpipe.streams.io.SharpStream;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.nio.charset.Charset;
import java.text.ParseException;
import java.util.Locale;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.xpath.XPathExpressionException;
/**
* @author kapodamy
*/
public class SubtitleConverter {
private static final String NEW_LINE = "\r\n";
public void dumpTTML(SharpStream in, final SharpStream out, final boolean ignoreEmptyFrames, final boolean detectYoutubeDuplicateLines
) throws IOException, ParseException, SAXException, ParserConfigurationException, XPathExpressionException {
final FrameWriter callback = new FrameWriter() {
int frameIndex = 0;
final Charset charset = Charset.forName("utf-8");
@Override
public void yield(SubtitleFrame frame) throws IOException {
if (ignoreEmptyFrames && frame.isEmptyText()) {
return;
}
out.write(String.valueOf(frameIndex++).getBytes(charset));
out.write(NEW_LINE.getBytes(charset));
out.write(getTime(frame.start, true).getBytes(charset));
out.write(" --> ".getBytes(charset));
out.write(getTime(frame.end, true).getBytes(charset));
out.write(NEW_LINE.getBytes(charset));
out.write(frame.text.getBytes(charset));
out.write(NEW_LINE.getBytes(charset));
out.write(NEW_LINE.getBytes(charset));
}
};
read_xml_based(in, callback, detectYoutubeDuplicateLines,
"tt", "xmlns", "http://www.w3.org/ns/ttml",
new String[]{"timedtext", "head", "wp"},
new String[]{"body", "div", "p"},
"begin", "end", true
);
}
private void read_xml_based(SharpStream source, FrameWriter callback, boolean detectYoutubeDuplicateLines,
String root, String formatAttr, String formatVersion, String[] cuePath, String[] framePath,
String timeAttr, String durationAttr, boolean hasTimestamp
) throws IOException, ParseException, SAXException, ParserConfigurationException, XPathExpressionException {
/*
* XML based subtitles parser with BASIC support
* multiple CUE is not supported
* styling is not supported
* tag timestamps (in auto-generated subtitles) are not supported, maybe in the future
* also TimestampTagOption enum is not applicable
* Language parsing is not supported
*/
byte[] buffer = new byte[(int) source.available()];
source.read(buffer);
DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
factory.setNamespaceAware(true);
DocumentBuilder builder = factory.newDocumentBuilder();
Document xml = builder.parse(new ByteArrayInputStream(buffer));
String attr;
// get the format version or namespace
Element node = xml.getDocumentElement();
if (node == null) {
throw new ParseException("Can't get the format version. ¿wrong namespace?", -1);
} else if (!node.getNodeName().equals(root)) {
throw new ParseException("Invalid root", -1);
}
if (formatAttr.equals("xmlns")) {
if (!node.getNamespaceURI().equals(formatVersion)) {
throw new UnsupportedOperationException("Expected xml namespace: " + formatVersion);
}
} else {
attr = node.getAttributeNS(formatVersion, formatAttr);
if (attr == null) {
throw new ParseException("Can't get the format attribute", -1);
}
if (!attr.equals(formatVersion)) {
throw new ParseException("Invalid format version : " + attr, -1);
}
}
NodeList node_list;
int line_break = 0;// Maximum characters per line if present (valid for TranScript v3)
if (!hasTimestamp) {
node_list = selectNodes(xml, cuePath, formatVersion);
if (node_list != null) {
// if the subtitle has multiple CUEs, use the highest value
for (int i = 0; i < node_list.getLength(); i++) {
try {
int tmp = Integer.parseInt(((Element) node_list.item(i)).getAttributeNS(formatVersion, "ah"));
if (tmp > line_break) {
line_break = tmp;
}
} catch (Exception err) {
}
}
}
}
// parse every frame
node_list = selectNodes(xml, framePath, formatVersion);
if (node_list == null) {
return;// no frames detected
}
int fs_ff = -1;// first timestamp of first frame
boolean limit_lines = false;
for (int i = 0; i < node_list.getLength(); i++) {
Element elem = (Element) node_list.item(i);
SubtitleFrame obj = new SubtitleFrame();
obj.text = elem.getTextContent();
attr = elem.getAttribute(timeAttr);// ¡this cant be null!
obj.start = hasTimestamp ? parseTimestamp(attr) : Integer.parseInt(attr);
attr = elem.getAttribute(durationAttr);
if (obj.text == null || attr == null) {
continue;// normally is a blank line (on auto-generated subtitles) ignore
}
if (hasTimestamp) {
obj.end = parseTimestamp(attr);
if (detectYoutubeDuplicateLines) {
if (limit_lines) {
int swap = obj.end;
obj.end = fs_ff;
fs_ff = swap;
} else {
if (fs_ff < 0) {
fs_ff = obj.end;
} else {
if (fs_ff < obj.start) {
limit_lines = true;// the subtitles has duplicated lines
} else {
detectYoutubeDuplicateLines = false;
}
}
}
}
} else {
obj.end = obj.start + Integer.parseInt(attr);
}
if (/*node.getAttribute("w").equals("1") &&*/line_break > 1 && obj.text.length() > line_break) {
// implement auto line breaking (once)
StringBuilder text = new StringBuilder(obj.text);
obj.text = null;
switch (text.charAt(line_break)) {
case ' ':
case '\t':
putBreakAt(line_break, text);
break;
default:// find the word start position
for (int j = line_break - 1; j > 0; j--) {
switch (text.charAt(j)) {
case ' ':
case '\t':
putBreakAt(j, text);
j = -1;
break;
case '\r':
case '\n':
j = -1;// long word, just ignore
break;
}
}
break;
}
obj.text = text.toString();// set the processed text
}
callback.yield(obj);
}
}
private static NodeList selectNodes(Document xml, String[] path, String namespaceUri) {
Element ref = xml.getDocumentElement();
for (int i = 0; i < path.length - 1; i++) {
NodeList nodes = ref.getChildNodes();
if (nodes.getLength() < 1) {
return null;
}
Element elem;
for (int j = 0; j < nodes.getLength(); j++) {
if (nodes.item(j).getNodeType() == Node.ELEMENT_NODE) {
elem = (Element) nodes.item(j);
if (elem.getNodeName().equals(path[i]) && elem.getNamespaceURI().equals(namespaceUri)) {
ref = elem;
break;
}
}
}
}
return ref.getElementsByTagNameNS(namespaceUri, path[path.length - 1]);
}
private static int parseTimestamp(String multiImpl) throws NumberFormatException, ParseException {
if (multiImpl.length() < 1) {
return 0;
} else if (multiImpl.length() == 1) {
return Integer.parseInt(multiImpl) * 1000;// ¡this must be a number in seconds!
}
// detect wallclock-time
if (multiImpl.startsWith("wallclock(")) {
throw new UnsupportedOperationException("Parsing wallclock timestamp is not implemented");
}
// detect offset-time
if (multiImpl.indexOf(':') < 0) {
int multiplier = 1000;
char metric = multiImpl.charAt(multiImpl.length() - 1);
switch (metric) {
case 'h':
multiplier *= 3600000;
break;
case 'm':
multiplier *= 60000;
break;
case 's':
if (multiImpl.charAt(multiImpl.length() - 2) == 'm') {
multiplier = 1;// ms
}
break;
default:
if (!Character.isDigit(metric)) {
throw new NumberFormatException("Invalid metric suffix found on : " + multiImpl);
}
metric = '\0';
break;
}
try {
String offset_time = multiImpl;
if (multiplier == 1) {
offset_time = offset_time.substring(0, offset_time.length() - 2);
} else if (metric != '\0') {
offset_time = offset_time.substring(0, offset_time.length() - 1);
}
double time_metric_based = Double.parseDouble(offset_time);
if (Math.abs(time_metric_based) <= Double.MAX_VALUE) {
return (int) (time_metric_based * multiplier);
}
} catch (Exception err) {
throw new UnsupportedOperationException("Invalid or not implemented timestamp on: " + multiImpl);
}
}
// detect clock-time
int time = 0;
String[] units = multiImpl.split(":");
if (units.length < 3) {
throw new ParseException("Invalid clock-time timestamp", -1);
}
time += Integer.parseInt(units[0]) * 3600000;// hours
time += Integer.parseInt(units[1]) * 60000;//minutes
time += Float.parseFloat(units[2]) * 1000f;// seconds and milliseconds (if present)
// frames and sub-frames are ignored (not implemented)
// time += units[3] * fps;
return time;
}
private static void putBreakAt(int idx, StringBuilder str) {
// this should be optimized at compile time
if (NEW_LINE.length() > 1) {
str.delete(idx, idx + 1);// remove after replace
str.insert(idx, NEW_LINE);
} else {
str.setCharAt(idx, NEW_LINE.charAt(0));
}
}
private static String getTime(int time, boolean comma) {
// cast every value to integer to avoid auto-round in ToString("00").
StringBuilder str = new StringBuilder(12);
str.append(numberToString(time / 1000 / 3600, 2));// hours
str.append(':');
str.append(numberToString(time / 1000 / 60 % 60, 2));// minutes
str.append(':');
str.append(numberToString(time / 1000 % 60, 2));// seconds
str.append(comma ? ',' : '.');
str.append(numberToString(time % 1000, 3));// miliseconds
return str.toString();
}
private static String numberToString(int nro, int pad) {
return String.format(Locale.ENGLISH, "%0".concat(String.valueOf(pad)).concat("d"), nro);
}
/******************
* helper classes *
******************/
private interface FrameWriter {
void yield(SubtitleFrame frame) throws IOException;
}
private static class SubtitleFrame {
//Java no support unsigned int
public int end;
public int start;
public String text = "";
private boolean isEmptyText() {
if (text == null) {
return true;
}
for (int i = 0; i < text.length(); i++) {
switch (text.charAt(i)) {
case ' ':
case '\t':
case '\r':
case '\n':
break;
default:
return false;
}
}
return true;
}
}
}

View file

@ -2,15 +2,10 @@ package us.shandian.giga.postprocessing;
import android.util.Log;
import org.schabi.newpipe.streams.SubtitleConverter;
import org.schabi.newpipe.streams.SrtFromTtmlWriter;
import org.schabi.newpipe.streams.io.SharpStream;
import org.xml.sax.SAXException;
import java.io.IOException;
import java.text.ParseException;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.xpath.XPathExpressionException;
/**
* @author kapodamy
@ -27,33 +22,16 @@ class TtmlConverter extends Postprocessing {
int process(SharpStream out, SharpStream... sources) throws IOException {
// check if the subtitle is already in srt and copy, this should never happen
String format = getArgumentAt(0, null);
boolean ignoreEmptyFrames = getArgumentAt(1, "true").equals("true");
if (format == null || format.equals("ttml")) {
SubtitleConverter ttmlDumper = new SubtitleConverter();
SrtFromTtmlWriter writer = new SrtFromTtmlWriter(out, ignoreEmptyFrames);
try {
ttmlDumper.dumpTTML(
sources[0],
out,
getArgumentAt(1, "true").equals("true"),
getArgumentAt(2, "true").equals("true")
);
writer.build(sources[0]);
} catch (Exception err) {
Log.e(TAG, "subtitle parse failed", err);
if (err instanceof IOException) {
return 1;
} else if (err instanceof ParseException) {
return 2;
} else if (err instanceof SAXException) {
return 3;
} else if (err instanceof ParserConfigurationException) {
return 4;
} else if (err instanceof XPathExpressionException) {
return 7;
}
return 8;
return err instanceof IOException ? 1 : 8;
}
return OK_RESULT;