prosiebensat1.py 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330
  1. # encoding: utf-8
  2. from __future__ import unicode_literals
  3. import re
  4. from hashlib import sha1
  5. from .common import InfoExtractor
  6. from ..utils import (
  7. compat_urllib_parse,
  8. unified_strdate,
  9. ExtractorError,
  10. )
  11. class ProSiebenSat1IE(InfoExtractor):
  12. IE_NAME = 'prosiebensat1'
  13. IE_DESC = 'ProSiebenSat.1 Digital'
  14. _VALID_URL = r'https?://(?:www\.)?(?:(?:prosieben|prosiebenmaxx|sixx|sat1|kabeleins|ran|the-voice-of-germany)\.de|fem\.com)/(?P<id>.+)'
  15. _TESTS = [
  16. {
  17. 'url': 'http://www.prosieben.de/tv/circus-halligalli/videos/218-staffel-2-episode-18-jahresrueckblick-ganze-folge',
  18. 'info_dict': {
  19. 'id': '2104602',
  20. 'ext': 'mp4',
  21. 'title': 'Staffel 2, Episode 18 - Jahresrückblick',
  22. 'description': 'md5:8733c81b702ea472e069bc48bb658fc1',
  23. 'upload_date': '20131231',
  24. 'duration': 5845.04,
  25. },
  26. 'params': {
  27. # rtmp download
  28. 'skip_download': True,
  29. },
  30. },
  31. {
  32. 'url': 'http://www.prosieben.de/videokatalog/Gesellschaft/Leben/Trends/video-Lady-Umstyling-f%C3%BCr-Audrina-Rebekka-Audrina-Fergen-billig-aussehen-Battal-Modica-700544.html',
  33. 'info_dict': {
  34. 'id': '2570327',
  35. 'ext': 'mp4',
  36. 'title': 'Lady-Umstyling für Audrina',
  37. 'description': 'md5:4c16d0c17a3461a0d43ea4084e96319d',
  38. 'upload_date': '20131014',
  39. 'duration': 606.76,
  40. },
  41. 'params': {
  42. # rtmp download
  43. 'skip_download': True,
  44. },
  45. 'skip': 'Seems to be broken',
  46. },
  47. {
  48. 'url': 'http://www.prosiebenmaxx.de/tv/experience/video/144-countdown-fuer-die-autowerkstatt-ganze-folge',
  49. 'info_dict': {
  50. 'id': '2429369',
  51. 'ext': 'mp4',
  52. 'title': 'Countdown für die Autowerkstatt',
  53. 'description': 'md5:809fc051a457b5d8666013bc40698817',
  54. 'upload_date': '20140223',
  55. 'duration': 2595.04,
  56. },
  57. 'params': {
  58. # rtmp download
  59. 'skip_download': True,
  60. },
  61. },
  62. {
  63. 'url': 'http://www.sixx.de/stars-style/video/sexy-laufen-in-ugg-boots-clip',
  64. 'info_dict': {
  65. 'id': '2904997',
  66. 'ext': 'mp4',
  67. 'title': 'Sexy laufen in Ugg Boots',
  68. 'description': 'md5:edf42b8bd5bc4e5da4db4222c5acb7d6',
  69. 'upload_date': '20140122',
  70. 'duration': 245.32,
  71. },
  72. 'params': {
  73. # rtmp download
  74. 'skip_download': True,
  75. },
  76. },
  77. {
  78. 'url': 'http://www.sat1.de/film/der-ruecktritt/video/im-interview-kai-wiesinger-clip',
  79. 'info_dict': {
  80. 'id': '2906572',
  81. 'ext': 'mp4',
  82. 'title': 'Im Interview: Kai Wiesinger',
  83. 'description': 'md5:e4e5370652ec63b95023e914190b4eb9',
  84. 'upload_date': '20140225',
  85. 'duration': 522.56,
  86. },
  87. 'params': {
  88. # rtmp download
  89. 'skip_download': True,
  90. },
  91. },
  92. {
  93. 'url': 'http://www.kabeleins.de/tv/rosins-restaurants/videos/jagd-auf-fertigkost-im-elsthal-teil-2-ganze-folge',
  94. 'info_dict': {
  95. 'id': '2992323',
  96. 'ext': 'mp4',
  97. 'title': 'Jagd auf Fertigkost im Elsthal - Teil 2',
  98. 'description': 'md5:2669cde3febe9bce13904f701e774eb6',
  99. 'upload_date': '20140225',
  100. 'duration': 2410.44,
  101. },
  102. 'params': {
  103. # rtmp download
  104. 'skip_download': True,
  105. },
  106. },
  107. {
  108. 'url': 'http://www.ran.de/fussball/bundesliga/video/schalke-toennies-moechte-raul-zurueck-ganze-folge',
  109. 'info_dict': {
  110. 'id': '3004256',
  111. 'ext': 'mp4',
  112. 'title': 'Schalke: Tönnies möchte Raul zurück',
  113. 'description': 'md5:4b5b271d9bcde223b54390754c8ece3f',
  114. 'upload_date': '20140226',
  115. 'duration': 228.96,
  116. },
  117. 'params': {
  118. # rtmp download
  119. 'skip_download': True,
  120. },
  121. },
  122. {
  123. 'url': 'http://www.the-voice-of-germany.de/video/31-andreas-kuemmert-rocket-man-clip',
  124. 'info_dict': {
  125. 'id': '2572814',
  126. 'ext': 'mp4',
  127. 'title': 'Andreas Kümmert: Rocket Man',
  128. 'description': 'md5:6ddb02b0781c6adf778afea606652e38',
  129. 'upload_date': '20131017',
  130. 'duration': 469.88,
  131. },
  132. 'params': {
  133. # rtmp download
  134. 'skip_download': True,
  135. },
  136. },
  137. {
  138. 'url': 'http://www.fem.com/wellness/videos/wellness-video-clip-kurztripps-zum-valentinstag.html',
  139. 'info_dict': {
  140. 'id': '2156342',
  141. 'ext': 'mp4',
  142. 'title': 'Kurztrips zum Valentinstag',
  143. 'description': 'Romantischer Kurztrip zum Valentinstag? Wir verraten, was sich hier wirklich lohnt.',
  144. 'duration': 307.24,
  145. },
  146. 'params': {
  147. # rtmp download
  148. 'skip_download': True,
  149. },
  150. },
  151. {
  152. 'url': 'http://www.prosieben.de/tv/joko-gegen-klaas/videos/playlists/episode-8-ganze-folge-playlist',
  153. 'info_dict': {
  154. 'id': '439664',
  155. 'title': 'Episode 8 - Ganze Folge - Playlist',
  156. 'description': 'Das finale und härteste Duell aller Zeiten ist vorbei! Der Weltmeister für dieses Jahr steht! Alle packenden Duelle der achten Episode von "Joko gegen Klaas - das Duell um die Welt" seht ihr hier noch einmal in voller Länge!',
  157. },
  158. 'playlist_count': 2,
  159. },
  160. ]
  161. _CLIPID_REGEXES = [
  162. r'"clip_id"\s*:\s+"(\d+)"',
  163. r'clipid: "(\d+)"',
  164. r'clip[iI]d=(\d+)',
  165. ]
  166. _TITLE_REGEXES = [
  167. r'<h2 class="subtitle" itemprop="name">\s*(.+?)</h2>',
  168. r'<header class="clearfix">\s*<h3>(.+?)</h3>',
  169. r'<!-- start video -->\s*<h1>(.+?)</h1>',
  170. r'<h1 class="att-name">\s*(.+?)</h1>',
  171. ]
  172. _DESCRIPTION_REGEXES = [
  173. r'<p itemprop="description">\s*(.+?)</p>',
  174. r'<div class="videoDecription">\s*<p><strong>Beschreibung</strong>: (.+?)</p>',
  175. r'<div class="g-plusone" data-size="medium"></div>\s*</div>\s*</header>\s*(.+?)\s*<footer>',
  176. r'<p class="att-description">\s*(.+?)\s*</p>',
  177. ]
  178. _UPLOAD_DATE_REGEXES = [
  179. r'<meta property="og:published_time" content="(.+?)">',
  180. r'<span>\s*(\d{2}\.\d{2}\.\d{4} \d{2}:\d{2}) \|\s*<span itemprop="duration"',
  181. r'<footer>\s*(\d{2}\.\d{2}\.\d{4}) \d{2}:\d{2} Uhr',
  182. r'<span style="padding-left: 4px;line-height:20px; color:#404040">(\d{2}\.\d{2}\.\d{4})</span>',
  183. r'(\d{2}\.\d{2}\.\d{4}) \| \d{2}:\d{2} Min<br/>',
  184. ]
  185. _ITEM_TYPE_REGEXES = [
  186. r"'itemType'\s*:\s*'([^']*)'",
  187. ]
  188. _ITEM_ID_REGEXES = [
  189. r"'itemId'\s*:\s*'([^']*)'",
  190. ]
  191. _PLAYLIST_CLIPS_REGEXES = [
  192. r'data-qvt=.+?<a href="([^"]+)"',
  193. ]
  194. def _real_extract(self, url):
  195. video_id = self._match_id(url)
  196. webpage = self._download_webpage(url, video_id)
  197. item_type = self._html_search_regex(self._ITEM_TYPE_REGEXES, webpage, 'item type', default='CLIP')
  198. if item_type == 'CLIP':
  199. return self._clip_extract(url, webpage)
  200. elif item_type == 'PLAYLIST':
  201. playlist_id = self._html_search_regex(self._ITEM_ID_REGEXES, webpage, 'playlist id')
  202. for regex in self._PLAYLIST_CLIPS_REGEXES:
  203. playlist_clips = re.findall(regex, webpage, re.DOTALL)
  204. if playlist_clips:
  205. title = self._html_search_regex(self._TITLE_REGEXES, webpage, 'title')
  206. description = self._html_search_regex(self._DESCRIPTION_REGEXES, webpage, 'description', fatal=False)
  207. root_url = re.match('(.+?//.+?)/', url).group(1)
  208. return {
  209. '_type': 'playlist',
  210. 'id': playlist_id,
  211. 'title': title,
  212. 'description': description,
  213. 'entries': [self._clip_extract(root_url + clip_path) for clip_path in playlist_clips]
  214. }
  215. else:
  216. raise ExtractorError('Unknown item type "%s"' % item_type)
  217. def _clip_extract(self, url, webpage=None):
  218. if webpage is None:
  219. video_id = self._match_id(url)
  220. webpage = self._download_webpage(url, video_id)
  221. clip_id = self._html_search_regex(self._CLIPID_REGEXES, webpage, 'clip id')
  222. access_token = 'testclient'
  223. client_name = 'kolibri-1.2.5'
  224. client_location = url
  225. videos_api_url = 'http://vas.sim-technik.de/vas/live/v2/videos?%s' % compat_urllib_parse.urlencode({
  226. 'access_token': access_token,
  227. 'client_location': client_location,
  228. 'client_name': client_name,
  229. 'ids': clip_id,
  230. })
  231. videos = self._download_json(videos_api_url, clip_id, 'Downloading videos JSON')
  232. duration = float(videos[0]['duration'])
  233. source_ids = [source['id'] for source in videos[0]['sources']]
  234. source_ids_str = ','.join(map(str, source_ids))
  235. g = '01!8d8F_)r9]4s[qeuXfP%'
  236. client_id = g[:2] + sha1(''.join([clip_id, g, access_token, client_location, g, client_name])
  237. .encode('utf-8')).hexdigest()
  238. sources_api_url = 'http://vas.sim-technik.de/vas/live/v2/videos/%s/sources?%s' % (clip_id, compat_urllib_parse.urlencode({
  239. 'access_token': access_token,
  240. 'client_id': client_id,
  241. 'client_location': client_location,
  242. 'client_name': client_name,
  243. }))
  244. sources = self._download_json(sources_api_url, clip_id, 'Downloading sources JSON')
  245. server_id = sources['server_id']
  246. client_id = g[:2] + sha1(''.join([g, clip_id, access_token, server_id,
  247. client_location, source_ids_str, g, client_name])
  248. .encode('utf-8')).hexdigest()
  249. url_api_url = 'http://vas.sim-technik.de/vas/live/v2/videos/%s/sources/url?%s' % (clip_id, compat_urllib_parse.urlencode({
  250. 'access_token': access_token,
  251. 'client_id': client_id,
  252. 'client_location': client_location,
  253. 'client_name': client_name,
  254. 'server_id': server_id,
  255. 'source_ids': source_ids_str,
  256. }))
  257. urls = self._download_json(url_api_url, clip_id, 'Downloading urls JSON')
  258. title = self._html_search_regex(self._TITLE_REGEXES, webpage, 'title')
  259. description = self._html_search_regex(self._DESCRIPTION_REGEXES, webpage, 'description', fatal=False)
  260. thumbnail = self._og_search_thumbnail(webpage)
  261. upload_date = unified_strdate(self._html_search_regex(
  262. self._UPLOAD_DATE_REGEXES, webpage, 'upload date', default=None))
  263. formats = []
  264. urls_sources = urls['sources']
  265. if isinstance(urls_sources, dict):
  266. urls_sources = urls_sources.values()
  267. def fix_bitrate(bitrate):
  268. return (bitrate // 1000) if bitrate % 1000 == 0 else bitrate
  269. for source in urls_sources:
  270. protocol = source['protocol']
  271. if protocol == 'rtmp' or protocol == 'rtmpe':
  272. mobj = re.search(r'^(?P<url>rtmpe?://[^/]+/(?P<app>[^/]+))/(?P<playpath>.+)$', source['url'])
  273. if not mobj:
  274. continue
  275. formats.append({
  276. 'url': mobj.group('url'),
  277. 'app': mobj.group('app'),
  278. 'play_path': mobj.group('playpath'),
  279. 'player_url': 'http://livepassdl.conviva.com/hf/ver/2.79.0.17083/LivePassModuleMain.swf',
  280. 'page_url': 'http://www.prosieben.de',
  281. 'vbr': fix_bitrate(source['bitrate']),
  282. 'ext': 'mp4',
  283. 'format_id': '%s_%s' % (source['cdn'], source['bitrate']),
  284. })
  285. else:
  286. formats.append({
  287. 'url': source['url'],
  288. 'vbr': fix_bitrate(source['bitrate']),
  289. })
  290. self._sort_formats(formats)
  291. return {
  292. 'id': clip_id,
  293. 'title': title,
  294. 'description': description,
  295. 'thumbnail': thumbnail,
  296. 'upload_date': upload_date,
  297. 'duration': duration,
  298. 'formats': formats,
  299. }