  1. # coding: utf-8
  2. from __future__ import unicode_literals
  3. import base64
  4. import functools
  5. import hashlib
  6. import itertools
  7. import json
  8. import random
  9. import re
  10. import string
  11. from .common import InfoExtractor
  12. from ..compat import compat_struct_pack
  13. from ..utils import (
  14. determine_ext,
  15. error_to_compat_str,
  16. ExtractorError,
  17. int_or_none,
  18. mimetype2ext,
  19. OnDemandPagedList,
  20. parse_iso8601,
  21. sanitized_Request,
  22. str_to_int,
  23. unescapeHTML,
  24. urlencode_postdata,
  25. try_get,
  26. )
  27. class DailymotionBaseInfoExtractor(InfoExtractor):
  28. @staticmethod
  29. def _build_request(url):
  30. """Build a request with the family filter disabled"""
  31. request = sanitized_Request(url)
  32. request.add_header('Cookie', 'family_filter=off; ff=off')
  33. return request
  34. def _download_webpage_handle_no_ff(self, url, *args, **kwargs):
  35. request = self._build_request(url)
  36. return self._download_webpage_handle(request, *args, **kwargs)
  37. def _download_webpage_no_ff(self, url, *args, **kwargs):
  38. request = self._build_request(url)
  39. return self._download_webpage(request, *args, **kwargs)
class DailymotionIE(DailymotionBaseInfoExtractor):
    """Extract a single video from dailymotion.com.

    Extraction strategy, in order:
      1. "player v5" JSON config embedded in the watch page,
      2. a Vevo embed, if the page links one,
      3. the legacy ``var info`` object on the /embed/video/ page.
    """
    _VALID_URL = r'(?i)https?://(?:(www|touch)\.)?dailymotion\.[a-z]{2,3}/(?:(?:(?:embed|swf|#)/)?video|swf)/(?P<id>[^/?_]+)'
    IE_NAME = 'dailymotion'

    # (legacy embed info key, format_id) pairs; only used by the
    # old-player fallback path at the end of _real_extract.
    _FORMATS = [
        ('stream_h264_ld_url', 'ld'),
        ('stream_h264_url', 'standard'),
        ('stream_h264_hq_url', 'hq'),
        ('stream_h264_hd_url', 'hd'),
        ('stream_h264_hd1080_url', 'hd180'),
    ]

    _TESTS = [{
        'url': 'http://www.dailymotion.com/video/x5kesuj_office-christmas-party-review-jason-bateman-olivia-munn-t-j-miller_news',
        'md5': '074b95bdee76b9e3654137aee9c79dfe',
        'info_dict': {
            'id': 'x5kesuj',
            'ext': 'mp4',
            'title': 'Office Christmas Party Review – Jason Bateman, Olivia Munn, T.J. Miller',
            'description': 'Office Christmas Party Review - Jason Bateman, Olivia Munn, T.J. Miller',
            'thumbnail': r're:^https?:.*\.(?:jpg|png)$',
            'duration': 187,
            'timestamp': 1493651285,
            'upload_date': '20170501',
            'uploader': 'Deadline',
            'uploader_id': 'x1xm8ri',
            'age_limit': 0,
        },
    }, {
        'url': 'https://www.dailymotion.com/video/x2iuewm_steam-machine-models-pricing-listed-on-steam-store-ign-news_videogames',
        'md5': '2137c41a8e78554bb09225b8eb322406',
        'info_dict': {
            'id': 'x2iuewm',
            'ext': 'mp4',
            'title': 'Steam Machine Models, Pricing Listed on Steam Store - IGN News',
            'description': 'Several come bundled with the Steam Controller.',
            'thumbnail': r're:^https?:.*\.(?:jpg|png)$',
            'duration': 74,
            'timestamp': 1425657362,
            'upload_date': '20150306',
            'uploader': 'IGN',
            'uploader_id': 'xijv66',
            'age_limit': 0,
            'view_count': int,
        },
        'skip': 'video gone',
    }, {
        # Vevo video
        'url': 'http://www.dailymotion.com/video/x149uew_katy-perry-roar-official_musi',
        'info_dict': {
            'title': 'Roar (Official)',
            'id': 'USUV71301934',
            'ext': 'mp4',
            'uploader': 'Katy Perry',
            'upload_date': '20130905',
        },
        'params': {
            'skip_download': True,
        },
        'skip': 'VEVO is only available in some countries',
    }, {
        # age-restricted video
        'url': 'http://www.dailymotion.com/video/xyh2zz_leanna-decker-cyber-girl-of-the-year-desires-nude-playboy-plus_redband',
        'md5': '0d667a7b9cebecc3c89ee93099c4159d',
        'info_dict': {
            'id': 'xyh2zz',
            'ext': 'mp4',
            'title': 'Leanna Decker - Cyber Girl Of The Year Desires Nude [Playboy Plus]',
            'uploader': 'HotWaves1012',
            'age_limit': 18,
        },
        'skip': 'video gone',
    }, {
        # geo-restricted, player v5
        'url': 'http://www.dailymotion.com/video/xhza0o',
        'only_matching': True,
    }, {
        # with subtitles
        'url': 'http://www.dailymotion.com/video/x20su5f_the-power-of-nightmares-1-the-rise-of-the-politics-of-fear-bbc-2004_news',
        'only_matching': True,
    }, {
        'url': 'http://www.dailymotion.com/swf/video/x3n92nf',
        'only_matching': True,
    }, {
        'url': 'http://www.dailymotion.com/swf/x3ss1m_funny-magic-trick-barry-and-stuart_fun',
        'only_matching': True,
    }]

    @staticmethod
    def _extract_urls(webpage):
        """Return the URLs of all Dailymotion players embedded in *webpage*."""
        # Look for embedded Dailymotion player
        matches = re.findall(
            r'<(?:(?:embed|iframe)[^>]+?src=|input[^>]+id=[\'"]dmcloudUrlEmissionSelect[\'"][^>]+value=)(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.com/(?:embed|swf)/video/.+?)\1', webpage)
        # Each match is a (quote-char, url) tuple; keep only the URL.
        return list(map(lambda m: unescapeHTML(m[1]), matches))

    def _real_extract(self, url):
        video_id = self._match_id(url)

        webpage = self._download_webpage_no_ff(
            'https://www.dailymotion.com/video/%s' % video_id, video_id)

        age_limit = self._rta_search(webpage)

        description = self._og_search_description(
            webpage, default=None) or self._html_search_meta(
            'description', webpage, 'description')

        view_count_str = self._search_regex(
            (r'<meta[^>]+itemprop="interactionCount"[^>]+content="UserPlays:([\s\d,.]+)"',
             r'video_views_count[^>]+>\s+([\s\d\,.]+)'),
            webpage, 'view count', default=None)
        if view_count_str:
            # Drop whitespace (used as a thousands separator) before parsing.
            view_count_str = re.sub(r'\s', '', view_count_str)
        view_count = str_to_int(view_count_str)
        comment_count = int_or_none(self._search_regex(
            r'<meta[^>]+itemprop="interactionCount"[^>]+content="UserComments:(\d+)"',
            webpage, 'comment count', default=None))

        player_v5 = self._search_regex(
            [r'buildPlayer\(({.+?})\);\n',  # See https://github.com/rg3/youtube-dl/issues/7826
             r'playerV5\s*=\s*dmp\.create\([^,]+?,\s*({.+?})\);',
             r'buildPlayer\(({.+?})\);',
             r'var\s+config\s*=\s*({.+?});',
             # New layout regex (see https://github.com/rg3/youtube-dl/issues/13580)
             r'__PLAYER_CONFIG__\s*=\s*({.+?});'],
            webpage, 'player v5', default=None)
        if player_v5:
            player = self._parse_json(player_v5, video_id)
            # Prefer metadata bundled in the player config; fall back to the
            # metadata endpoint when the config does not carry it.
            metadata = try_get(
                player, lambda x: x['metadata'], dict) or self._download_json(
                'http://www.dailymotion.com/player/metadata/video/%s' % video_id, video_id, query={
                    'integration': 'inline',
                    'GK_PV5_NEON': '1',
                })

            if metadata.get('error', {}).get('type') == 'password_protected':
                password = self._downloader.params.get('videopassword')
                if password:
                    # Re-request metadata under an obfuscated id built from the
                    # base36 video id, an md5 of the password and a random salt
                    # (presumably mirroring the player's own scheme).
                    r = int(metadata['id'][1:], 36)
                    us64e = lambda x: base64.urlsafe_b64encode(x).decode().strip('=')
                    t = ''.join(random.choice(string.ascii_letters) for i in range(10))
                    n = us64e(compat_struct_pack('I', r))
                    i = us64e(hashlib.md5(('%s%d%s' % (password, r, t)).encode()).digest())
                    metadata = self._download_json(
                        'http://www.dailymotion.com/player/metadata/video/p' + i + t + n, video_id)

            self._check_error(metadata)

            formats = []
            for quality, media_list in metadata['qualities'].items():
                for media in media_list:
                    media_url = media.get('url')
                    if not media_url:
                        continue
                    type_ = media.get('type')
                    if type_ == 'application/vnd.lumberjack.manifest':
                        continue
                    ext = mimetype2ext(type_) or determine_ext(media_url)
                    if ext == 'm3u8':
                        m3u8_formats = self._extract_m3u8_formats(
                            media_url, video_id, 'mp4', preference=-1,
                            m3u8_id='hls', fatal=False)
                        for f in m3u8_formats:
                            # Strip the URL fragment part.
                            f['url'] = f['url'].split('#')[0]
                            formats.append(f)
                    elif ext == 'f4m':
                        formats.extend(self._extract_f4m_formats(
                            media_url, video_id, preference=-1, f4m_id='hds', fatal=False))
                    else:
                        f = {
                            'url': media_url,
                            'format_id': 'http-%s' % quality,
                            'ext': ext,
                        }
                        # Progressive URLs encode the resolution as "H264-WxH".
                        m = re.search(r'H264-(?P<width>\d+)x(?P<height>\d+)', media_url)
                        if m:
                            f.update({
                                'width': int(m.group('width')),
                                'height': int(m.group('height')),
                            })
                        formats.append(f)
            self._sort_formats(formats)

            title = metadata['title']
            duration = int_or_none(metadata.get('duration'))
            timestamp = int_or_none(metadata.get('created_time'))
            thumbnail = metadata.get('poster_url')
            uploader = metadata.get('owner', {}).get('screenname')
            uploader_id = metadata.get('owner', {}).get('id')

            subtitles = {}
            subtitles_data = metadata.get('subtitles', {}).get('data', {})
            if subtitles_data and isinstance(subtitles_data, dict):
                for subtitle_lang, subtitle in subtitles_data.items():
                    subtitles[subtitle_lang] = [{
                        'ext': determine_ext(subtitle_url),
                        'url': subtitle_url,
                    } for subtitle_url in subtitle.get('urls', [])]

            return {
                'id': video_id,
                'title': title,
                'description': description,
                'thumbnail': thumbnail,
                'duration': duration,
                'timestamp': timestamp,
                'uploader': uploader,
                'uploader_id': uploader_id,
                'age_limit': age_limit,
                'view_count': view_count,
                'comment_count': comment_count,
                'formats': formats,
                'subtitles': subtitles,
            }

        # vevo embed
        vevo_id = self._search_regex(
            r'<link rel="video_src" href="[^"]*?vevo\.com[^"]*?video=(?P<id>[\w]*)',
            webpage, 'vevo embed', default=None)
        if vevo_id:
            return self.url_result('vevo:%s' % vevo_id, 'Vevo')

        # fallback old player
        embed_page = self._download_webpage_no_ff(
            'https://www.dailymotion.com/embed/video/%s' % video_id,
            video_id, 'Downloading embed page')

        timestamp = parse_iso8601(self._html_search_meta(
            'video:release_date', webpage, 'upload date'))

        info = self._parse_json(
            self._search_regex(
                r'var info = ({.*?}),$', embed_page,
                'video info', flags=re.MULTILINE),
            video_id)

        self._check_error(info)

        formats = []
        for (key, format_id) in self._FORMATS:
            video_url = info.get(key)
            if video_url is not None:
                # Same "H264-WxH" resolution convention as the v5 path.
                m_size = re.search(r'H264-(\d+)x(\d+)', video_url)
                if m_size is not None:
                    width, height = map(int_or_none, (m_size.group(1), m_size.group(2)))
                else:
                    width, height = None, None
                formats.append({
                    'url': video_url,
                    'ext': 'mp4',
                    'format_id': format_id,
                    'width': width,
                    'height': height,
                })
        self._sort_formats(formats)

        # subtitles
        video_subtitles = self.extract_subtitles(video_id, webpage)

        title = self._og_search_title(webpage, default=None)
        if title is None:
            title = self._html_search_regex(
                r'(?s)<span\s+id="video_title"[^>]*>(.*?)</span>', webpage,
                'title')

        return {
            'id': video_id,
            'formats': formats,
            'uploader': info['owner.screenname'],
            'timestamp': timestamp,
            'title': title,
            'description': description,
            'subtitles': video_subtitles,
            'thumbnail': info['thumbnail_url'],
            'age_limit': age_limit,
            'view_count': view_count,
            'duration': info['duration']
        }

    def _check_error(self, info):
        """Raise ExtractorError (or a geo-restriction error for code DM007)
        if the metadata/info object carries an ``error`` field."""
        error = info.get('error')
        if error:
            title = error.get('title') or error['message']
            # See https://developer.dailymotion.com/api#access-error
            if error.get('code') == 'DM007':
                self.raise_geo_restricted(msg=title)
            raise ExtractorError(
                '%s said: %s' % (self.IE_NAME, title), expected=True)

    def _get_subtitles(self, video_id, webpage):
        """Fetch the subtitle list from the public API.

        Returns ``{language: [{'url': ..., 'ext': 'srt'}]}``, or ``{}``
        when the download fails or the video has no subtitles.
        """
        try:
            sub_list = self._download_webpage(
                'https://api.dailymotion.com/video/%s/subtitles?fields=id,language,url' % video_id,
                video_id, note=False)
        except ExtractorError as err:
            # Best-effort: subtitles are optional, so only warn.
            self._downloader.report_warning('unable to download video subtitles: %s' % error_to_compat_str(err))
            return {}
        info = json.loads(sub_list)
        if (info['total'] > 0):
            sub_lang_list = dict((l['language'], [{'url': l['url'], 'ext': 'srt'}]) for l in info['list'])
            return sub_lang_list
        self._downloader.report_warning('video doesn\'t have subtitles')
        return {}
  317. class DailymotionPlaylistIE(DailymotionBaseInfoExtractor):
  318. IE_NAME = 'dailymotion:playlist'
  319. _VALID_URL = r'(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/playlist/(?P<id>x[0-9a-z]+)'
  320. _TESTS = [{
  321. 'url': 'http://www.dailymotion.com/playlist/xv4bw_nqtv_sport/1#video=xl8v3q',
  322. 'info_dict': {
  323. 'title': 'SPORT',
  324. 'id': 'xv4bw',
  325. },
  326. 'playlist_mincount': 20,
  327. }]
  328. _PAGE_SIZE = 100
  329. def _fetch_page(self, playlist_id, authorizaion, page):
  330. page += 1
  331. videos = self._download_json(
  332. 'https://graphql.api.dailymotion.com',
  333. playlist_id, 'Downloading page %d' % page,
  334. data=json.dumps({
  335. 'query': '''{
  336. collection(xid: "%s") {
  337. videos(first: %d, page: %d) {
  338. pageInfo {
  339. hasNextPage
  340. nextPage
  341. }
  342. edges {
  343. node {
  344. xid
  345. url
  346. }
  347. }
  348. }
  349. }
  350. }''' % (playlist_id, self._PAGE_SIZE, page)
  351. }).encode(), headers={
  352. 'Authorization': authorizaion,
  353. 'Origin': 'https://www.dailymotion.com',
  354. })['data']['collection']['videos']
  355. for edge in videos['edges']:
  356. node = edge['node']
  357. yield self.url_result(
  358. node['url'], DailymotionIE.ie_key(), node['xid'])
  359. def _real_extract(self, url):
  360. playlist_id = self._match_id(url)
  361. webpage = self._download_webpage(url, playlist_id)
  362. api = self._parse_json(self._search_regex(
  363. r'__PLAYER_CONFIG__\s*=\s*({.+?});',
  364. webpage, 'player config'), playlist_id)['context']['api']
  365. auth = self._download_json(
  366. api.get('auth_url', 'https://graphql.api.dailymotion.com/oauth/token'),
  367. playlist_id, data=urlencode_postdata({
  368. 'client_id': api.get('client_id', 'f1a362d288c1b98099c7'),
  369. 'client_secret': api.get('client_secret', 'eea605b96e01c796ff369935357eca920c5da4c5'),
  370. 'grant_type': 'client_credentials',
  371. }))
  372. authorizaion = '%s %s' % (auth.get('token_type', 'Bearer'), auth['access_token'])
  373. entries = OnDemandPagedList(functools.partial(
  374. self._fetch_page, playlist_id, authorizaion), self._PAGE_SIZE)
  375. return self.playlist_result(
  376. entries, playlist_id,
  377. self._og_search_title(webpage))
  378. class DailymotionUserIE(DailymotionBaseInfoExtractor):
  379. IE_NAME = 'dailymotion:user'
  380. _VALID_URL = r'https?://(?:www\.)?dailymotion\.[a-z]{2,3}/(?!(?:embed|swf|#|video|playlist)/)(?:(?:old/)?user/)?(?P<user>[^/]+)'
  381. _MORE_PAGES_INDICATOR = r'(?s)<div class="pages[^"]*">.*?<a\s+class="[^"]*?icon-arrow_right[^"]*?"'
  382. _PAGE_TEMPLATE = 'http://www.dailymotion.com/user/%s/%s'
  383. _TESTS = [{
  384. 'url': 'https://www.dailymotion.com/user/nqtv',
  385. 'info_dict': {
  386. 'id': 'nqtv',
  387. 'title': 'Rémi Gaillard',
  388. },
  389. 'playlist_mincount': 100,
  390. }, {
  391. 'url': 'http://www.dailymotion.com/user/UnderProject',
  392. 'info_dict': {
  393. 'id': 'UnderProject',
  394. 'title': 'UnderProject',
  395. },
  396. 'playlist_mincount': 1800,
  397. 'expected_warnings': [
  398. 'Stopped at duplicated page',
  399. ],
  400. 'skip': 'Takes too long time',
  401. }]
  402. def _extract_entries(self, id):
  403. video_ids = set()
  404. processed_urls = set()
  405. for pagenum in itertools.count(1):
  406. page_url = self._PAGE_TEMPLATE % (id, pagenum)
  407. webpage, urlh = self._download_webpage_handle_no_ff(
  408. page_url, id, 'Downloading page %s' % pagenum)
  409. if urlh.geturl() in processed_urls:
  410. self.report_warning('Stopped at duplicated page %s, which is the same as %s' % (
  411. page_url, urlh.geturl()), id)
  412. break
  413. processed_urls.add(urlh.geturl())
  414. for video_id in re.findall(r'data-xid="(.+?)"', webpage):
  415. if video_id not in video_ids:
  416. yield self.url_result(
  417. 'http://www.dailymotion.com/video/%s' % video_id,
  418. DailymotionIE.ie_key(), video_id)
  419. video_ids.add(video_id)
  420. if re.search(self._MORE_PAGES_INDICATOR, webpage) is None:
  421. break
  422. def _real_extract(self, url):
  423. mobj = re.match(self._VALID_URL, url)
  424. user = mobj.group('user')
  425. webpage = self._download_webpage(
  426. 'https://www.dailymotion.com/user/%s' % user, user)
  427. full_user = unescapeHTML(self._html_search_regex(
  428. r'<a class="nav-image" title="([^"]+)" href="/%s">' % re.escape(user),
  429. webpage, 'user'))
  430. return {
  431. '_type': 'playlist',
  432. 'id': user,
  433. 'title': full_user,
  434. 'entries': self._extract_entries(user),
  435. }