tvp.py 9.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243
  1. # coding: utf-8
  2. from __future__ import unicode_literals
  3. import re
  4. from .common import InfoExtractor
  5. from ..utils import (
  6. determine_ext,
  7. clean_html,
  8. get_element_by_attribute,
  9. ExtractorError,
  10. )
  11. class TVPIE(InfoExtractor):
  12. IE_NAME = 'tvp'
  13. IE_DESC = 'Telewizja Polska'
  14. _VALID_URL = r'https?://[^/]+\.tvp\.(?:pl|info)/(?:video/(?:[^,\s]*,)*|(?:(?!\d+/)[^/]+/)*)(?P<id>\d+)'
  15. _TESTS = [{
  16. 'url': 'https://vod.tvp.pl/video/czas-honoru,i-seria-odc-13,194536',
  17. 'md5': 'a21eb0aa862f25414430f15fdfb9e76c',
  18. 'info_dict': {
  19. 'id': '194536',
  20. 'ext': 'mp4',
  21. 'title': 'Czas honoru, odc. 13 – Władek',
  22. 'description': 'md5:437f48b93558370b031740546b696e24',
  23. },
  24. }, {
  25. 'url': 'http://www.tvp.pl/there-can-be-anything-so-i-shortened-it/17916176',
  26. 'md5': 'b0005b542e5b4de643a9690326ab1257',
  27. 'info_dict': {
  28. 'id': '17916176',
  29. 'ext': 'mp4',
  30. 'title': 'TVP Gorzów pokaże filmy studentów z podroży dookoła świata',
  31. 'description': 'TVP Gorzów pokaże filmy studentów z podroży dookoła świata',
  32. },
  33. }, {
  34. # page id is not the same as video id(#7799)
  35. 'url': 'https://wiadomosci.tvp.pl/33908820/28092017-1930',
  36. 'md5': '84cd3c8aec4840046e5ab712416b73d0',
  37. 'info_dict': {
  38. 'id': '33908820',
  39. 'ext': 'mp4',
  40. 'title': 'Wiadomości, 28.09.2017, 19:30',
  41. 'description': 'Wydanie główne codziennego serwisu informacyjnego.'
  42. },
  43. 'skip': 'HTTP Error 404: Not Found',
  44. }, {
  45. 'url': 'http://vod.tvp.pl/seriale/obyczajowe/na-sygnale/sezon-2-27-/odc-39/17834272',
  46. 'only_matching': True,
  47. }, {
  48. 'url': 'http://wiadomosci.tvp.pl/25169746/24052016-1200',
  49. 'only_matching': True,
  50. }, {
  51. 'url': 'http://krakow.tvp.pl/25511623/25lecie-mck-wyjatkowe-miejsce-na-mapie-krakowa',
  52. 'only_matching': True,
  53. }, {
  54. 'url': 'http://teleexpress.tvp.pl/25522307/wierni-wzieli-udzial-w-procesjach',
  55. 'only_matching': True,
  56. }, {
  57. 'url': 'http://sport.tvp.pl/25522165/krychowiak-uspokaja-w-sprawie-kontuzji-dwa-tygodnie-to-maksimum',
  58. 'only_matching': True,
  59. }, {
  60. 'url': 'http://www.tvp.info/25511919/trwa-rewolucja-wladza-zdecydowala-sie-na-pogwalcenie-konstytucji',
  61. 'only_matching': True,
  62. }]
  63. def _real_extract(self, url):
  64. page_id = self._match_id(url)
  65. webpage = self._download_webpage(url, page_id)
  66. video_id = self._search_regex([
  67. r'<iframe[^>]+src="[^"]*?object_id=(\d+)',
  68. r"object_id\s*:\s*'(\d+)'",
  69. r'data-video-id="(\d+)"'], webpage, 'video id', default=page_id)
  70. return {
  71. '_type': 'url_transparent',
  72. 'url': 'tvp:' + video_id,
  73. 'description': self._og_search_description(
  74. webpage, default=None) or self._html_search_meta(
  75. 'description', webpage, default=None),
  76. 'thumbnail': self._og_search_thumbnail(webpage, default=None),
  77. 'ie_key': 'TVPEmbed',
  78. }
  79. class TVPEmbedIE(InfoExtractor):
  80. IE_NAME = 'tvp:embed'
  81. IE_DESC = 'Telewizja Polska'
  82. _VALID_URL = r'(?:tvp:|https?://[^/]+\.tvp\.(?:pl|info)/sess/tvplayer\.php\?.*?object_id=)(?P<id>\d+)'
  83. _TESTS = [{
  84. 'url': 'tvp:194536',
  85. 'md5': 'a21eb0aa862f25414430f15fdfb9e76c',
  86. 'info_dict': {
  87. 'id': '194536',
  88. 'ext': 'mp4',
  89. 'title': 'Czas honoru, odc. 13 – Władek',
  90. },
  91. }, {
  92. # not available
  93. 'url': 'http://www.tvp.pl/sess/tvplayer.php?object_id=22670268',
  94. 'md5': '8c9cd59d16edabf39331f93bf8a766c7',
  95. 'info_dict': {
  96. 'id': '22670268',
  97. 'ext': 'mp4',
  98. 'title': 'Panorama, 07.12.2015, 15:40',
  99. },
  100. 'skip': 'Transmisja została zakończona lub materiał niedostępny',
  101. }, {
  102. 'url': 'tvp:22670268',
  103. 'only_matching': True,
  104. }]
  105. def _real_extract(self, url):
  106. video_id = self._match_id(url)
  107. webpage = self._download_webpage(
  108. 'http://www.tvp.pl/sess/tvplayer.php?object_id=%s' % video_id, video_id)
  109. error = self._html_search_regex(
  110. r'(?s)<p[^>]+\bclass=["\']notAvailable__text["\'][^>]*>(.+?)</p>',
  111. webpage, 'error', default=None) or clean_html(
  112. get_element_by_attribute('class', 'msg error', webpage))
  113. if error:
  114. raise ExtractorError('%s said: %s' % (
  115. self.IE_NAME, clean_html(error)), expected=True)
  116. title = self._search_regex(
  117. r'name\s*:\s*([\'"])Title\1\s*,\s*value\s*:\s*\1(?P<title>.+?)\1',
  118. webpage, 'title', group='title')
  119. series_title = self._search_regex(
  120. r'name\s*:\s*([\'"])SeriesTitle\1\s*,\s*value\s*:\s*\1(?P<series>.+?)\1',
  121. webpage, 'series', group='series', default=None)
  122. if series_title:
  123. title = '%s, %s' % (series_title, title)
  124. thumbnail = self._search_regex(
  125. r"poster\s*:\s*'([^']+)'", webpage, 'thumbnail', default=None)
  126. video_url = self._search_regex(
  127. r'0:{src:([\'"])(?P<url>.*?)\1', webpage,
  128. 'formats', group='url', default=None)
  129. if not video_url or 'material_niedostepny.mp4' in video_url:
  130. video_url = self._download_json(
  131. 'http://www.tvp.pl/pub/stat/videofileinfo?video_id=%s' % video_id,
  132. video_id)['video_url']
  133. formats = []
  134. video_url_base = self._search_regex(
  135. r'(https?://.+?/video)(?:\.(?:ism|f4m|m3u8)|-\d+\.mp4)',
  136. video_url, 'video base url', default=None)
  137. if video_url_base:
  138. # TODO: <Group> found instead of <AdaptationSet> in MPD manifest.
  139. # It's not mentioned in MPEG-DASH standard. Figure that out.
  140. # formats.extend(self._extract_mpd_formats(
  141. # video_url_base + '.ism/video.mpd',
  142. # video_id, mpd_id='dash', fatal=False))
  143. formats.extend(self._extract_ism_formats(
  144. video_url_base + '.ism/Manifest',
  145. video_id, 'mss', fatal=False))
  146. formats.extend(self._extract_f4m_formats(
  147. video_url_base + '.ism/video.f4m',
  148. video_id, f4m_id='hds', fatal=False))
  149. m3u8_formats = self._extract_m3u8_formats(
  150. video_url_base + '.ism/video.m3u8', video_id,
  151. 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)
  152. self._sort_formats(m3u8_formats)
  153. m3u8_formats = list(filter(
  154. lambda f: f.get('vcodec') != 'none', m3u8_formats))
  155. formats.extend(m3u8_formats)
  156. for i, m3u8_format in enumerate(m3u8_formats, 2):
  157. http_url = '%s-%d.mp4' % (video_url_base, i)
  158. if self._is_valid_url(http_url, video_id):
  159. f = m3u8_format.copy()
  160. f.update({
  161. 'url': http_url,
  162. 'format_id': f['format_id'].replace('hls', 'http'),
  163. 'protocol': 'http',
  164. })
  165. formats.append(f)
  166. else:
  167. formats = [{
  168. 'format_id': 'direct',
  169. 'url': video_url,
  170. 'ext': determine_ext(video_url, 'mp4'),
  171. }]
  172. self._sort_formats(formats)
  173. return {
  174. 'id': video_id,
  175. 'title': title,
  176. 'thumbnail': thumbnail,
  177. 'formats': formats,
  178. }
  179. class TVPSeriesIE(InfoExtractor):
  180. IE_NAME = 'tvp:series'
  181. _VALID_URL = r'https?://vod\.tvp\.pl/(?:[^/]+/){2}(?P<id>[^/]+)/?$'
  182. _TESTS = [{
  183. 'url': 'http://vod.tvp.pl/filmy-fabularne/filmy-za-darmo/ogniem-i-mieczem',
  184. 'info_dict': {
  185. 'title': 'Ogniem i mieczem',
  186. 'id': '4278026',
  187. },
  188. 'playlist_count': 4,
  189. }, {
  190. 'url': 'http://vod.tvp.pl/audycje/podroze/boso-przez-swiat',
  191. 'info_dict': {
  192. 'title': 'Boso przez świat',
  193. 'id': '9329207',
  194. },
  195. 'playlist_count': 86,
  196. }]
  197. def _real_extract(self, url):
  198. display_id = self._match_id(url)
  199. webpage = self._download_webpage(url, display_id, tries=5)
  200. title = self._html_search_regex(
  201. r'(?s) id=[\'"]path[\'"]>(?:.*? / ){2}(.*?)</span>', webpage, 'series')
  202. playlist_id = self._search_regex(r'nodeId:\s*(\d+)', webpage, 'playlist id')
  203. playlist = self._download_webpage(
  204. 'http://vod.tvp.pl/vod/seriesAjax?type=series&nodeId=%s&recommend'
  205. 'edId=0&sort=&page=0&pageSize=10000' % playlist_id, display_id, tries=5,
  206. note='Downloading playlist')
  207. videos_paths = re.findall(
  208. '(?s)class="shortTitle">.*?href="(/[^"]+)', playlist)
  209. entries = [
  210. self.url_result('http://vod.tvp.pl%s' % v_path, ie=TVPIE.ie_key())
  211. for v_path in videos_paths]
  212. return {
  213. '_type': 'playlist',
  214. 'id': playlist_id,
  215. 'display_id': display_id,
  216. 'title': title,
  217. 'entries': entries,
  218. }