restructure parser

This commit is contained in:
Christian Schabesberger 2016-02-02 18:43:20 +01:00
parent bad576c23d
commit d097363b24
12 changed files with 365 additions and 274 deletions

View file

@ -72,7 +72,7 @@ public class VideoItemDetailActivity extends AppCompatActivity {
StreamingService[] serviceList = ServiceList.getServices(); StreamingService[] serviceList = ServiceList.getServices();
//VideoExtractor videoExtractor = null; //VideoExtractor videoExtractor = null;
for (int i = 0; i < serviceList.length; i++) { for (int i = 0; i < serviceList.length; i++) {
if (serviceList[i].acceptUrl(videoUrl)) { if (serviceList[i].getUrlIdHandler().acceptUrl(videoUrl)) {
arguments.putInt(VideoItemDetailFragment.STREAMING_SERVICE, i); arguments.putInt(VideoItemDetailFragment.STREAMING_SERVICE, i);
currentStreamingService = i; currentStreamingService = i;
//videoExtractor = ServiceList.getService(i).getExtractorInstance(); //videoExtractor = ServiceList.getService(i).getExtractorInstance();

View file

@ -112,7 +112,7 @@ public class VideoItemDetailFragment extends Fragment {
public void run() { public void run() {
try { try {
videoExtractor = service.getExtractorInstance(videoUrl, new Downloader()); videoExtractor = service.getExtractorInstance(videoUrl, new Downloader());
VideoInfo videoInfo = videoExtractor.getVideoInfo(); VideoInfo videoInfo = VideoInfo.getVideoInfo(videoExtractor, new Downloader());
h.post(new VideoResultReturnedRunnable(videoInfo)); h.post(new VideoResultReturnedRunnable(videoInfo));
h.post(new SetThumbnailRunnable( h.post(new SetThumbnailRunnable(
//todo: make bitmaps not bypass tor //todo: make bitmaps not bypass tor

View file

@ -0,0 +1,101 @@
package org.schabi.newpipe.crawler;
import android.util.Xml;
import org.xmlpull.v1.XmlPullParser;
import java.io.IOException;
import java.io.StringReader;
import java.util.Vector;
/**
* Created by Christian Schabesberger on 02.02.16.
*
* Copyright (C) Christian Schabesberger 2016 <chris.schabesberger@mailbox.org>
* DashMpdParser.java is part of NewPipe.
*
* NewPipe is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* NewPipe is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with NewPipe. If not, see <http://www.gnu.org/licenses/>.
*/
/**
 * Downloads a DASH MPD manifest and extracts the audio streams it describes.
 */
public class DashMpdParser {

    /** Raised when the manifest can not be downloaded or can not be parsed. */
    static class DashMpdParsingException extends ParsingException {
        DashMpdParsingException(String message, Exception e) {
            super(message, e);
        }
    }

    /**
     * Downloads the manifest behind {@code dashManifestUrl} and returns one
     * {@link VideoInfo.AudioStream} per {@code BaseURL} found inside an
     * {@code AdaptationSet} whose {@code mimeType} contains "audio".
     *
     * <p>The stream format is set to the id of {@code MediaFormat.WEBMA} or
     * {@code MediaFormat.M4A} when the mime type equals one of theirs, and to -1 otherwise.
     * Bandwidth and sampling rate are taken from the enclosing {@code Representation}.
     *
     * @param dashManifestUrl url of the manifest to download
     * @param downloader      used to fetch the manifest
     * @return the audio streams found; empty if the manifest lists none
     * @throws DashMpdParsingException if the download fails or the document can not be parsed
     */
    public static VideoInfo.AudioStream[] getAudioStreams(String dashManifestUrl,
                                                          Downloader downloader)
            throws DashMpdParsingException {
        String dashDoc;
        try {
            dashDoc = downloader.download(dashManifestUrl);
        } catch (IOException ioe) {
            throw new DashMpdParsingException("Could not get dash mpd: " + dashManifestUrl, ioe);
        }

        Vector<VideoInfo.AudioStream> audioStreams = new Vector<>();
        try {
            XmlPullParser parser = Xml.newPullParser();
            parser.setInput(new StringReader(dashDoc));

            String currentMimeType = "";
            int currentBandwidth = -1;
            int currentSamplingRate = -1;
            boolean currentTagIsBaseUrl = false;

            for (int eventType = parser.getEventType();
                 eventType != XmlPullParser.END_DOCUMENT;
                 eventType = parser.next()) {
                switch (eventType) {
                    case XmlPullParser.START_TAG: {
                        String openedTag = parser.getName();
                        if (openedTag.equals("AdaptationSet")) {
                            String mimeType = parser.getAttributeValue(
                                    XmlPullParser.NO_NAMESPACE, "mimeType");
                            // A set without a mimeType attribute is treated as "not audio"
                            // instead of failing the whole parse with a NullPointerException.
                            currentMimeType = (mimeType == null) ? "" : mimeType;
                        } else if (openedTag.equals("Representation")
                                && currentMimeType.contains("audio")) {
                            currentBandwidth = Integer.parseInt(
                                    parser.getAttributeValue(XmlPullParser.NO_NAMESPACE, "bandwidth"));
                            currentSamplingRate = Integer.parseInt(
                                    parser.getAttributeValue(XmlPullParser.NO_NAMESPACE, "audioSamplingRate"));
                        } else if (openedTag.equals("BaseURL")) {
                            currentTagIsBaseUrl = true;
                        }
                        break;
                    }
                    case XmlPullParser.TEXT:
                        if (currentTagIsBaseUrl && currentMimeType.contains("audio")) {
                            int format = -1;
                            if (currentMimeType.equals(MediaFormat.WEBMA.mimeType)) {
                                format = MediaFormat.WEBMA.id;
                            } else if (currentMimeType.equals(MediaFormat.M4A.mimeType)) {
                                format = MediaFormat.M4A.id;
                            }
                            audioStreams.add(new VideoInfo.AudioStream(parser.getText(),
                                    format, currentBandwidth, currentSamplingRate));
                        }
                        break;
                    case XmlPullParser.END_TAG: {
                        // Ask the parser which tag is being closed. The name remembered from the
                        // last START_TAG is the innermost child, so it never equals
                        // "AdaptationSet" when a non-empty set ends.
                        String closedTag = parser.getName();
                        if (closedTag.equals("AdaptationSet")) {
                            currentMimeType = "";
                        } else if (closedTag.equals("BaseURL")) {
                            currentTagIsBaseUrl = false;
                        }
                        break;
                    }
                    default:
                        break;
                }
            }
        } catch (Exception e) {
            throw new DashMpdParsingException("Could not parse Dash mpd", e);
        }
        return audioStreams.toArray(new VideoInfo.AudioStream[audioStreams.size()]);
    }
}

View file

@ -0,0 +1,47 @@
package org.schabi.newpipe.crawler;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* Created by Christian Schabesberger on 02.02.16.
*
* Copyright (C) Christian Schabesberger 2016 <chris.schabesberger@mailbox.org>
* RegexHelper.java is part of NewPipe.
*
* NewPipe is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* NewPipe is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with NewPipe. If not, see <http://www.gnu.org/licenses/>.
*/
/**
 * Convenience wrapper around {@link Pattern} and {@link Matcher}.
 *
 * <p>Avoid using regex where another way of parsing is available!
 */
public class RegexHelper {

    /** Thrown when a pattern can not be found inside of the given input. */
    public static class RegexException extends ParsingException {
        public RegexException(String message) {
            super(message);
        }
    }

    /**
     * Searches {@code input} for the first occurrence of {@code pattern} and returns the text
     * captured by the first group of that occurrence.
     *
     * @param pattern regular expression; it has to declare at least one capturing group
     * @param input   text to search in
     * @return the content of capturing group 1 of the first match
     * @throws RegexException if {@code pattern} does not occur in {@code input}
     */
    public static String matchGroup1(String pattern, String input) throws RegexException {
        Matcher mat = Pattern.compile(pattern).matcher(input);
        if (!mat.find()) {
            // Pattern and input are each wrapped in their own pair of quotes so the
            // message stays readable when either of them contains spaces.
            throw new RegexException(
                    "failed to find pattern \"" + pattern + "\" inside of \"" + input + "\"");
        }
        return mat.group(1);
    }
}

View file

@ -27,11 +27,11 @@ public interface StreamingService {
public String name = ""; public String name = "";
} }
ServiceInfo getServiceInfo(); ServiceInfo getServiceInfo();
VideoExtractor getExtractorInstance(String url, Downloader downloader) throws IOException, CrawlingException; VideoExtractor getExtractorInstance(String url, Downloader downloader)
throws IOException, CrawlingException;
SearchEngine getSearchEngineInstance(); SearchEngine getSearchEngineInstance();
/**When a VIEW_ACTION is caught this function will test if the url delivered within the calling UrlIdHandler getUrlIdHandler();
Intent was meant to be watched with this Service.
Return false if this service shall not allow to be called through ACTIONs.*/
boolean acceptUrl(String videoUrl);
} }

View file

@ -0,0 +1,32 @@
package org.schabi.newpipe.crawler;
/**
* Created by Christian Schabesberger on 02.02.16.
*
* Copyright (C) Christian Schabesberger 2016 <chris.schabesberger@mailbox.org>
* UrlIdHandler.java is part of NewPipe.
*
* NewPipe is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* NewPipe is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with NewPipe. If not, see <http://www.gnu.org/licenses/>.
*/
/**
 * Translates between the urls of a streaming service and the ids of its videos, and decides
 * whether a url belongs to the service at all.
 */
public interface UrlIdHandler {
/**
 * Builds the url of a video from its id.
 *
 * @param videoId id of a video on the service
 * @return url of the video with that id
 */
String getVideoUrl(String videoId);
/**
 * Extracts the id of a video from a url.
 *
 * @param siteUrl url of a video on the service
 * @return the id contained in the url
 * @throws ParsingException if no id can be extracted from the url
 */
String getVideoId(String siteUrl) throws ParsingException;
/**
 * Returns a cleaned up version of the given url.
 * NOTE(review): presumably the result is the canonical url of the same video (the YouTube
 * handler rebuilds it from the extracted id) - confirm for other services.
 *
 * @param siteUrl url of a video on the service
 * @return the cleaned url
 * @throws ParsingException if the url can not be processed
 */
String cleanUrl(String siteUrl) throws ParsingException;
/**
 * Tests whether a url that was delivered within the Intent of a caught VIEW_ACTION is meant
 * to be watched with this service.
 * Implementations return false if the service shall not be callable through ACTIONs.
 *
 * @param videoUrl the url delivered by the Intent
 * @return true if this service accepts the url
 */
boolean acceptUrl(String videoUrl);
}

View file

@ -20,14 +20,14 @@ package org.schabi.newpipe.crawler;
* along with NewPipe. If not, see <http://www.gnu.org/licenses/>. * along with NewPipe. If not, see <http://www.gnu.org/licenses/>.
*/ */
import java.util.List; import java.net.URL;
import java.util.Vector; import java.util.Vector;
/**Scrapes information from a video streaming service (eg, YouTube).*/ /**Scrapes information from a video streaming service (eg, YouTube).*/
@SuppressWarnings("ALL") @SuppressWarnings("ALL")
public abstract class VideoExtractor { public interface VideoExtractor {
public class ExctractorInitException extends CrawlingException { public class ExctractorInitException extends CrawlingException {
public ExctractorInitException() {} public ExctractorInitException() {}
@ -42,13 +42,6 @@ public abstract class VideoExtractor {
} }
} }
public class RegexException extends ParsingException {
public RegexException() {}
public RegexException(String message) {
super(message);
}
}
public class ContentNotAvailableException extends ParsingException { public class ContentNotAvailableException extends ParsingException {
public ContentNotAvailableException() {} public ContentNotAvailableException() {}
public ContentNotAvailableException(String message) { public ContentNotAvailableException(String message) {
@ -62,111 +55,6 @@ public abstract class VideoExtractor {
} }
} }
protected final String pageUrl;
protected VideoInfo videoInfo;
@SuppressWarnings("WeakerAccess")
public VideoExtractor(String url, Downloader dl) {
this.pageUrl = url;
}
/**Fills out the video info fields which are common to all services.
* Probably needs to be overridden by subclasses*/
public VideoInfo getVideoInfo() throws CrawlingException
{
if(videoInfo == null) {
videoInfo = new VideoInfo();
}
if(videoInfo.webpage_url.isEmpty()) {
videoInfo.webpage_url = pageUrl;
}
if (videoInfo.title.isEmpty()) {
videoInfo.title = getTitle();
}
if (videoInfo.duration < 1) {
videoInfo.duration = getLength();
}
if (videoInfo.uploader.isEmpty()) {
videoInfo.uploader = getUploader();
}
if (videoInfo.description.isEmpty()) {
videoInfo.description = getDescription();
}
if (videoInfo.view_count == -1) {
videoInfo.view_count = getViews();
}
if (videoInfo.upload_date.isEmpty()) {
videoInfo.upload_date = getUploadDate();
}
if (videoInfo.thumbnail_url.isEmpty()) {
videoInfo.thumbnail_url = getThumbnailUrl();
}
if (videoInfo.id.isEmpty()) {
videoInfo.id = getVideoId(pageUrl);
}
/** Load and extract audio*/
if (videoInfo.audioStreams == null) {
videoInfo.audioStreams = getAudioStreams();
}
/** Extract video stream url*/
if (videoInfo.videoStreams == null) {
videoInfo.videoStreams = getVideoStreams();
}
if (videoInfo.uploader_thumbnail_url.isEmpty()) {
videoInfo.uploader_thumbnail_url = getUploaderThumbnailUrl();
}
if (videoInfo.startPosition < 0) {
videoInfo.startPosition = getTimeStamp();
}
if(videoInfo.dashMpdUrl.isEmpty()) {
videoInfo.dashMpdUrl = getDashMpdUrl();
}
if(videoInfo.average_rating.isEmpty()) {
videoInfo.average_rating = getAverageRating();
}
if(videoInfo.like_count == -1) {
videoInfo.like_count = getLikeCount();
}
if(videoInfo.dislike_count == -1) {
videoInfo.dislike_count = getDislikeCount();
}
if(videoInfo.nextVideo == null) {
videoInfo.nextVideo = getNextVideo();
}
if(videoInfo.relatedVideos == null) {
videoInfo.relatedVideos = getRelatedVideos();
}
//Bitmap thumbnail = null;
//Bitmap uploader_thumbnail = null;
//int videoAvailableStatus = VIDEO_AVAILABLE;
return videoInfo;
}
//todo: remove these functions, or make them static, otherwise its useles, to have them here
public abstract String getVideoUrl(String videoId);
public abstract String getVideoId(String siteUrl) throws ParsingException;
///////////////////////////////////////////////////////////////////////////////////////////
public abstract int getTimeStamp() throws ParsingException; public abstract int getTimeStamp() throws ParsingException;
public abstract String getTitle() throws ParsingException; public abstract String getTitle() throws ParsingException;
public abstract String getDescription() throws ParsingException; public abstract String getDescription() throws ParsingException;
@ -185,4 +73,6 @@ public abstract class VideoExtractor {
public abstract int getDislikeCount() throws ParsingException; public abstract int getDislikeCount() throws ParsingException;
public abstract VideoPreviewInfo getNextVideo() throws ParsingException; public abstract VideoPreviewInfo getNextVideo() throws ParsingException;
public abstract Vector<VideoPreviewInfo> getRelatedVideos() throws ParsingException; public abstract Vector<VideoPreviewInfo> getRelatedVideos() throws ParsingException;
public abstract UrlIdHandler getUrlIdConverter();
public abstract String getPageUrl();
} }

View file

@ -1,5 +1,6 @@
package org.schabi.newpipe.crawler; package org.schabi.newpipe.crawler;
import java.io.IOException;
import java.util.List; import java.util.List;
/** /**
@ -26,8 +27,52 @@ import java.util.List;
@SuppressWarnings("ALL") @SuppressWarnings("ALL")
public class VideoInfo extends AbstractVideoInfo { public class VideoInfo extends AbstractVideoInfo {
/**Fills out the video info fields which are common to all services.
* Probably needs to be overridden by subclasses*/
public static VideoInfo getVideoInfo(VideoExtractor extractor, Downloader downloader)
throws CrawlingException, IOException {
VideoInfo videoInfo = new VideoInfo();
UrlIdHandler uiconv = extractor.getUrlIdConverter();
videoInfo.webpage_url = extractor.getPageUrl();
videoInfo.title = extractor.getTitle();
videoInfo.duration = extractor.getLength();
videoInfo.uploader = extractor.getUploader();
videoInfo.description = extractor.getDescription();
videoInfo.view_count = extractor.getViews();
videoInfo.upload_date = extractor.getUploadDate();
videoInfo.thumbnail_url = extractor.getThumbnailUrl();
videoInfo.id = uiconv.getVideoId(extractor.getPageUrl());
videoInfo.dashMpdUrl = extractor.getDashMpdUrl();
/** Load and extract audio*/
videoInfo.audioStreams = extractor.getAudioStreams();
if(videoInfo.dashMpdUrl != null && !videoInfo.dashMpdUrl.isEmpty()) {
if(videoInfo.audioStreams == null || videoInfo.audioStreams.length == 0) {
videoInfo.audioStreams =
DashMpdParser.getAudioStreams(videoInfo.dashMpdUrl, downloader);
}
}
/** Extract video stream url*/
videoInfo.videoStreams = extractor.getVideoStreams();
videoInfo.uploader_thumbnail_url = extractor.getUploaderThumbnailUrl();
videoInfo.startPosition = extractor.getTimeStamp();
videoInfo.average_rating = extractor.getAverageRating();
videoInfo.like_count = extractor.getLikeCount();
videoInfo.dislike_count = extractor.getDislikeCount();
videoInfo.nextVideo = extractor.getNextVideo();
videoInfo.relatedVideos = extractor.getRelatedVideos();
//Bitmap thumbnail = null;
//Bitmap uploader_thumbnail = null;
//int videoAvailableStatus = VIDEO_AVAILABLE;
return videoInfo;
}
public String uploader_thumbnail_url = ""; public String uploader_thumbnail_url = "";
public String description = ""; public String description = "";
/*todo: make this lists over vectors*/
public VideoStream[] videoStreams = null; public VideoStream[] videoStreams = null;
public AudioStream[] audioStreams = null; public AudioStream[] audioStreams = null;
// video streams provided by the dash mpd do not need to be provided as VideoStream. // video streams provided by the dash mpd do not need to be provided as VideoStream.

View file

@ -52,7 +52,8 @@ public class YoutubeSearchEngine implements SearchEngine {
private static final String TAG = YoutubeSearchEngine.class.toString(); private static final String TAG = YoutubeSearchEngine.class.toString();
@Override @Override
public Result search(String query, int page, String languageCode, Downloader downloader) throws IOException, ParsingException { public Result search(String query, int page, String languageCode, Downloader downloader)
throws IOException, ParsingException {
Result result = new Result(); Result result = new Result();
Uri.Builder builder = new Uri.Builder(); Uri.Builder builder = new Uri.Builder();
builder.scheme("https") builder.scheme("https")
@ -171,7 +172,8 @@ public class YoutubeSearchEngine implements SearchEngine {
try { try {
dBuilder = dbFactory.newDocumentBuilder(); dBuilder = dbFactory.newDocumentBuilder();
doc = dBuilder.parse(new InputSource(new ByteArrayInputStream(response.getBytes("utf-8")))); doc = dBuilder.parse(new InputSource(
new ByteArrayInputStream(response.getBytes("utf-8"))));
doc.getDocumentElement().normalize(); doc.getDocumentElement().normalize();
} catch (ParserConfigurationException | SAXException | IOException e) { } catch (ParserConfigurationException | SAXException | IOException e) {
e.printStackTrace(); e.printStackTrace();

View file

@ -3,6 +3,7 @@ package org.schabi.newpipe.crawler.services.youtube;
import org.schabi.newpipe.crawler.CrawlingException; import org.schabi.newpipe.crawler.CrawlingException;
import org.schabi.newpipe.crawler.Downloader; import org.schabi.newpipe.crawler.Downloader;
import org.schabi.newpipe.crawler.StreamingService; import org.schabi.newpipe.crawler.StreamingService;
import org.schabi.newpipe.crawler.UrlIdHandler;
import org.schabi.newpipe.crawler.VideoExtractor; import org.schabi.newpipe.crawler.VideoExtractor;
import org.schabi.newpipe.crawler.SearchEngine; import org.schabi.newpipe.crawler.SearchEngine;
@ -37,8 +38,10 @@ public class YoutubeService implements StreamingService {
return serviceInfo; return serviceInfo;
} }
@Override @Override
public VideoExtractor getExtractorInstance(String url, Downloader downloader) throws CrawlingException, IOException { public VideoExtractor getExtractorInstance(String url, Downloader downloader)
if(acceptUrl(url)) { throws CrawlingException, IOException {
UrlIdHandler urlIdHandler = new YoutubeUrlIdHandler();
if(urlIdHandler.acceptUrl(url)) {
return new YoutubeVideoExtractor(url, downloader) ; return new YoutubeVideoExtractor(url, downloader) ;
} }
else { else {
@ -49,9 +52,9 @@ public class YoutubeService implements StreamingService {
public SearchEngine getSearchEngineInstance() { public SearchEngine getSearchEngineInstance() {
return new YoutubeSearchEngine(); return new YoutubeSearchEngine();
} }
@Override @Override
public boolean acceptUrl(String videoUrl) { public UrlIdHandler getUrlIdHandler() {
return videoUrl.contains("youtube") || return new YoutubeUrlIdHandler();
videoUrl.contains("youtu.be");
} }
} }

View file

@ -0,0 +1,68 @@
package org.schabi.newpipe.crawler.services.youtube;
import org.schabi.newpipe.crawler.ParsingException;
import org.schabi.newpipe.crawler.RegexHelper;
import org.schabi.newpipe.crawler.UrlIdHandler;
/**
* Created by Christian Schabesberger on 02.02.16.
*
* Copyright (C) Christian Schabesberger 2016 <chris.schabesberger@mailbox.org>
* YoutubeUrlIdHandler.java is part of NewPipe.
*
* NewPipe is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* NewPipe is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with NewPipe. If not, see <http://www.gnu.org/licenses/>.
*/
public class YoutubeUrlIdHandler implements UrlIdHandler {
@SuppressWarnings("WeakerAccess")
@Override
public String getVideoUrl(String videoId) {
return "https://www.youtube.com/watch?v=" + videoId;
}
@SuppressWarnings("WeakerAccess")
@Override
public String getVideoId(String url) throws ParsingException {
String id;
String pat;
if(url.contains("youtube")) {
pat = "youtube\\.com/watch\\?v=([\\-a-zA-Z0-9_]{11})";
}
else if(url.contains("youtu.be")) {
pat = "youtu\\.be/([a-zA-Z0-9_-]{11})";
}
else {
throw new ParsingException("Error no suitable url: " + url);
}
id = RegexHelper.matchGroup1(pat, url);
if(!id.isEmpty()){
//Log.i(TAG, "string \""+url+"\" matches!");
return id;
} else {
throw new ParsingException("Error could not parse url: " + url);
}
}
public String cleanUrl(String complexUrl) throws ParsingException {
return getVideoUrl(getVideoId(complexUrl));
}
@Override
public boolean acceptUrl(String videoUrl) {
return videoUrl.contains("youtube") ||
videoUrl.contains("youtu.be");
}
}

View file

@ -15,6 +15,8 @@ import org.mozilla.javascript.ScriptableObject;
import org.schabi.newpipe.crawler.CrawlingException; import org.schabi.newpipe.crawler.CrawlingException;
import org.schabi.newpipe.crawler.Downloader; import org.schabi.newpipe.crawler.Downloader;
import org.schabi.newpipe.crawler.ParsingException; import org.schabi.newpipe.crawler.ParsingException;
import org.schabi.newpipe.crawler.RegexHelper;
import org.schabi.newpipe.crawler.UrlIdHandler;
import org.schabi.newpipe.crawler.VideoExtractor; import org.schabi.newpipe.crawler.VideoExtractor;
import org.schabi.newpipe.crawler.MediaFormat; import org.schabi.newpipe.crawler.MediaFormat;
import org.schabi.newpipe.crawler.VideoInfo; import org.schabi.newpipe.crawler.VideoInfo;
@ -25,11 +27,8 @@ import java.io.IOException;
import java.io.StringReader; import java.io.StringReader;
import java.net.URLDecoder; import java.net.URLDecoder;
import java.util.HashMap; import java.util.HashMap;
import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.Vector; import java.util.Vector;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/** /**
* Created by Christian Schabesberger on 06.08.15. * Created by Christian Schabesberger on 06.08.15.
@ -51,7 +50,7 @@ import java.util.regex.Pattern;
* along with NewPipe. If not, see <http://www.gnu.org/licenses/>. * along with NewPipe. If not, see <http://www.gnu.org/licenses/>.
*/ */
public class YoutubeVideoExtractor extends VideoExtractor { public class YoutubeVideoExtractor implements VideoExtractor {
public class DecryptException extends ParsingException { public class DecryptException extends ParsingException {
DecryptException(Throwable cause) { DecryptException(Throwable cause) {
@ -75,7 +74,6 @@ public class YoutubeVideoExtractor extends VideoExtractor {
private static final String TAG = YoutubeVideoExtractor.class.toString(); private static final String TAG = YoutubeVideoExtractor.class.toString();
private final Document doc; private final Document doc;
private JSONObject playerArgs; private JSONObject playerArgs;
private String errorMessage = "";
// static values // static values
private static final String DECRYPTION_FUNC_NAME="decrypt"; private static final String DECRYPTION_FUNC_NAME="decrypt";
@ -83,23 +81,27 @@ public class YoutubeVideoExtractor extends VideoExtractor {
// cached values // cached values
private static volatile String decryptionCode = ""; private static volatile String decryptionCode = "";
UrlIdHandler urlidhandler = new YoutubeUrlIdHandler();
String pageUrl = "";
private Downloader downloader; private Downloader downloader;
public YoutubeVideoExtractor(String pageUrl, Downloader dl) throws CrawlingException, IOException { public YoutubeVideoExtractor(String pageUrl, Downloader dl) throws CrawlingException, IOException {
//most common videoInfo fields are now set in our superclass, for all services //most common videoInfo fields are now set in our superclass, for all services
super(pageUrl, dl);
downloader = dl; downloader = dl;
String pageContent = downloader.download(cleanUrl(pageUrl)); this.pageUrl = urlidhandler.cleanUrl(pageUrl);
String pageContent = downloader.download(this.pageUrl);
doc = Jsoup.parse(pageContent, pageUrl); doc = Jsoup.parse(pageContent, pageUrl);
String ytPlayerConfigRaw; String ytPlayerConfigRaw;
JSONObject ytPlayerConfig; JSONObject ytPlayerConfig;
//attempt to load the youtube js player JSON arguments //attempt to load the youtube js player JSON arguments
try { try {
ytPlayerConfigRaw = matchGroup1("ytplayer.config\\s*=\\s*(\\{.*?\\});", pageContent); ytPlayerConfigRaw =
RegexHelper.matchGroup1("ytplayer.config\\s*=\\s*(\\{.*?\\});", pageContent);
ytPlayerConfig = new JSONObject(ytPlayerConfigRaw); ytPlayerConfig = new JSONObject(ytPlayerConfigRaw);
playerArgs = ytPlayerConfig.getJSONObject("args"); playerArgs = ytPlayerConfig.getJSONObject("args");
} catch (RegexException e) { } catch (RegexHelper.RegexException e) {
String errorReason = findErrorReason(doc); String errorReason = findErrorReason(doc);
switch(errorReason) { switch(errorReason) {
case "GEMA": case "GEMA":
@ -233,7 +235,16 @@ public class YoutubeVideoExtractor extends VideoExtractor {
@Override @Override
public String getDashMpdUrl() throws ParsingException { public String getDashMpdUrl() throws ParsingException {
try { try {
return playerArgs.getString("dashmpd"); String dashManifest = playerArgs.getString("dashmpd");
if(!dashManifest.contains("/signature/")) {
String encryptedSig = RegexHelper.matchGroup1("/s/([a-fA-F0-9\\.]+)", dashManifest);
String decryptedSig;
decryptedSig = decryptSignature(encryptedSig, decryptionCode);
dashManifest = dashManifest.replace("/s/" + encryptedSig, "/signature/" + decryptedSig);
}
return dashManifest;
} catch(NullPointerException e) { } catch(NullPointerException e) {
throw new ParsingException( throw new ParsingException(
"Could not find \"dashmpd\" upon the player args (maybe no dash manifest available).", e); "Could not find \"dashmpd\" upon the player args (maybe no dash manifest available).", e);
@ -244,15 +255,8 @@ public class YoutubeVideoExtractor extends VideoExtractor {
@Override @Override
public VideoInfo.AudioStream[] getAudioStreams() throws ParsingException { public VideoInfo.AudioStream[] getAudioStreams() throws ParsingException {
try { /* If we provide a valid dash manifest, we don't need to provide audio streams extra */
String dashManifest = playerArgs.getString("dashmpd"); return null;
return parseDashManifest(dashManifest, decryptionCode);
} catch (NullPointerException e) {
throw new ParsingException(
"Could not find \"dashmpd\" upon the player args (maybe no dash manifest available).", e);
} catch (Exception e) {
throw new ParsingException(e);
}
} }
@Override @Override
@ -300,37 +304,6 @@ public class YoutubeVideoExtractor extends VideoExtractor {
return videoStreams.toArray(new VideoInfo.VideoStream[videoStreams.size()]); return videoStreams.toArray(new VideoInfo.VideoStream[videoStreams.size()]);
} }
@SuppressWarnings("WeakerAccess")
@Override
public String getVideoId(String url) throws ParsingException {
String id;
String pat;
if(url.contains("youtube")) {
pat = "youtube\\.com/watch\\?v=([\\-a-zA-Z0-9_]{11})";
}
else if(url.contains("youtu.be")) {
pat = "youtu\\.be/([a-zA-Z0-9_-]{11})";
}
else {
throw new ParsingException("Error no suitable url: " + url);
}
id = matchGroup1(pat, url);
if(!id.isEmpty()){
//Log.i(TAG, "string \""+url+"\" matches!");
return id;
} else {
throw new ParsingException("Error could not parse url: " + url);
}
}
@SuppressWarnings("WeakerAccess")
@Override
public String getVideoUrl(String videoId) {
return "https://www.youtube.com/watch?v=" + videoId;
}
/**Attempts to parse (and return) the offset to start playing the video from. /**Attempts to parse (and return) the offset to start playing the video from.
* @return the offset (in seconds), or 0 if no timestamp is found.*/ * @return the offset (in seconds), or 0 if no timestamp is found.*/
@Override @Override
@ -338,8 +311,8 @@ public class YoutubeVideoExtractor extends VideoExtractor {
//todo: add unit test for timestamp //todo: add unit test for timestamp
String timeStamp; String timeStamp;
try { try {
timeStamp = matchGroup1("((#|&|\\?)t=\\d{0,3}h?\\d{0,3}m?\\d{1,3}s?)", pageUrl); timeStamp = RegexHelper.matchGroup1("((#|&|\\?)t=\\d{0,3}h?\\d{0,3}m?\\d{1,3}s?)", pageUrl);
} catch (RegexException e) { } catch (RegexHelper.RegexException e) {
// catch this instantly since an url does not necessarily have to have a time stamp // catch this instantly since an url does not necessarily have to have a time stamp
// -2 because well the testing system will then know its the regex that failed :/ // -2 because well the testing system will then know its the regex that failed :/
@ -354,15 +327,15 @@ public class YoutubeVideoExtractor extends VideoExtractor {
String minutesString = ""; String minutesString = "";
String hoursString = ""; String hoursString = "";
try { try {
secondsString = matchGroup1("(\\d{1,3})s", timeStamp); secondsString = RegexHelper.matchGroup1("(\\d{1,3})s", timeStamp);
minutesString = matchGroup1("(\\d{1,3})m", timeStamp); minutesString = RegexHelper.matchGroup1("(\\d{1,3})m", timeStamp);
hoursString = matchGroup1("(\\d{1,3})h", timeStamp); hoursString = RegexHelper.matchGroup1("(\\d{1,3})h", timeStamp);
} catch (Exception e) { } catch (Exception e) {
//it could be that time is given in another method //it could be that time is given in another method
if (secondsString.isEmpty() //if nothing was got, if (secondsString.isEmpty() //if nothing was got,
&& minutesString.isEmpty()//treat as unlabelled seconds && minutesString.isEmpty()//treat as unlabelled seconds
&& hoursString.isEmpty()) { && hoursString.isEmpty()) {
secondsString = matchGroup1("t=(\\d{1,3})", timeStamp); secondsString = RegexHelper.matchGroup1("t=(\\d{1,3})", timeStamp);
} }
} }
@ -455,72 +428,14 @@ public class YoutubeVideoExtractor extends VideoExtractor {
} }
} }
private VideoInfo.AudioStream[] parseDashManifest(String dashManifest, String decryptoinCode) throws RegexException, DecryptException { @Override
if(!dashManifest.contains("/signature/")) { public UrlIdHandler getUrlIdConverter() {
String encryptedSig = matchGroup1("/s/([a-fA-F0-9\\.]+)", dashManifest); return new YoutubeUrlIdHandler();
String decryptedSig; }
decryptedSig = decryptSignature(encryptedSig, decryptoinCode); @Override
dashManifest = dashManifest.replace("/s/" + encryptedSig, "/signature/" + decryptedSig); public String getPageUrl() {
} return pageUrl;
String dashDoc;
try {
dashDoc = downloader.download(dashManifest);
} catch(IOException ioe) {
throw new DecryptException("Could not get dash mpd", ioe);
}
Vector<VideoInfo.AudioStream> audioStreams = new Vector<>();
try {
XmlPullParser parser = Xml.newPullParser();
parser.setInput(new StringReader(dashDoc));
String tagName = "";
String currentMimeType = "";
int currentBandwidth = -1;
int currentSamplingRate = -1;
boolean currentTagIsBaseUrl = false;
for(int eventType = parser.getEventType();
eventType != XmlPullParser.END_DOCUMENT;
eventType = parser.next() ) {
switch(eventType) {
case XmlPullParser.START_TAG:
tagName = parser.getName();
if(tagName.equals("AdaptationSet")) {
currentMimeType = parser.getAttributeValue(XmlPullParser.NO_NAMESPACE, "mimeType");
} else if(tagName.equals("Representation") && currentMimeType.contains("audio")) {
currentBandwidth = Integer.parseInt(
parser.getAttributeValue(XmlPullParser.NO_NAMESPACE, "bandwidth"));
currentSamplingRate = Integer.parseInt(
parser.getAttributeValue(XmlPullParser.NO_NAMESPACE, "audioSamplingRate"));
} else if(tagName.equals("BaseURL")) {
currentTagIsBaseUrl = true;
}
break;
case XmlPullParser.TEXT:
if(currentTagIsBaseUrl &&
(currentMimeType.contains("audio"))) {
int format = -1;
if(currentMimeType.equals(MediaFormat.WEBMA.mimeType)) {
format = MediaFormat.WEBMA.id;
} else if(currentMimeType.equals(MediaFormat.M4A.mimeType)) {
format = MediaFormat.M4A.id;
}
audioStreams.add(new VideoInfo.AudioStream(parser.getText(),
format, currentBandwidth, currentSamplingRate));
}
//missing break here?
case XmlPullParser.END_TAG:
if(tagName.equals("AdaptationSet")) {
currentMimeType = "";
} else if(tagName.equals("BaseURL")) {
currentTagIsBaseUrl = false;
}//no break needed here
}
}
} catch(Exception e) {
e.printStackTrace();
}
return audioStreams.toArray(new VideoInfo.AudioStream[audioStreams.size()]);
} }
/**Provides information about links to other videos on the video page, such as related videos. /**Provides information about links to other videos on the video page, such as related videos.
@ -533,7 +448,7 @@ public class YoutubeVideoExtractor extends VideoExtractor {
info.webpage_url = li.select("a.content-link").first() info.webpage_url = li.select("a.content-link").first()
.attr("abs:href"); .attr("abs:href");
info.id = matchGroup1("v=([0-9a-zA-Z-]*)", info.webpage_url); info.id = RegexHelper.matchGroup1("v=([0-9a-zA-Z-]*)", info.webpage_url);
//todo: check NullPointerException causing //todo: check NullPointerException causing
info.title = li.select("span.title").first().text(); info.title = li.select("span.title").first().text();
@ -584,15 +499,20 @@ public class YoutubeVideoExtractor extends VideoExtractor {
try { try {
String playerCode = downloader.download(playerUrl); String playerCode = downloader.download(playerUrl);
decryptionFuncName = matchGroup1("\\.sig\\|\\|([a-zA-Z0-9$]+)\\(", playerCode); decryptionFuncName =
RegexHelper.matchGroup1("\\.sig\\|\\|([a-zA-Z0-9$]+)\\(", playerCode);
String functionPattern = "(" + decryptionFuncName.replace("$", "\\$") + "=function\\([a-zA-Z0-9_]*\\)\\{.+?\\})"; String functionPattern = "("
decryptionFunc = "var " + matchGroup1(functionPattern, playerCode) + ";"; + decryptionFuncName.replace("$", "\\$")
+ "=function\\([a-zA-Z0-9_]*\\)\\{.+?\\})";
decryptionFunc = "var " + RegexHelper.matchGroup1(functionPattern, playerCode) + ";";
helperObjectName = matchGroup1(";([A-Za-z0-9_\\$]{2})\\...\\(", decryptionFunc); helperObjectName = RegexHelper
.matchGroup1(";([A-Za-z0-9_\\$]{2})\\...\\(", decryptionFunc);
String helperPattern = "(var " + helperObjectName.replace("$", "\\$") + "=\\{.+?\\}\\};)"; String helperPattern = "(var "
helperObject = matchGroup1(helperPattern, playerCode); + helperObjectName.replace("$", "\\$") + "=\\{.+?\\}\\};)";
helperObject = RegexHelper.matchGroup1(helperPattern, playerCode);
callerFunc = callerFunc.replace("%%", decryptionFuncName); callerFunc = callerFunc.replace("%%", decryptionFuncName);
@ -624,25 +544,8 @@ public class YoutubeVideoExtractor extends VideoExtractor {
return (result == null ? "" : result.toString()); return (result == null ? "" : result.toString());
} }
private String cleanUrl(String complexUrl) throws ParsingException {
return getVideoUrl(getVideoId(complexUrl));
}
private String matchGroup1(String pattern, String input) throws RegexException {
Pattern pat = Pattern.compile(pattern);
Matcher mat = pat.matcher(input);
boolean foundMatch = mat.find();
if (foundMatch) {
return mat.group(1);
}
else {
//Log.e(TAG, "failed to find pattern \""+pattern+"\" inside of \""+input+"\"");
throw new RegexException("failed to find pattern \""+pattern+" inside of "+input+"\"");
}
}
private String findErrorReason(Document doc) { private String findErrorReason(Document doc) {
errorMessage = doc.select("h1[id=\"unavailable-message\"]").first().text(); String errorMessage = doc.select("h1[id=\"unavailable-message\"]").first().text();
if(errorMessage.contains("GEMA")) { if(errorMessage.contains("GEMA")) {
// Gema sometimes blocks youtube music content in germany: // Gema sometimes blocks youtube music content in germany:
// https://www.gema.de/en/ // https://www.gema.de/en/