tvnet.py 4.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133
  1. # coding: utf-8
  2. from __future__ import unicode_literals
  3. import re
  4. from .common import InfoExtractor
  5. from ..compat import compat_str
  6. from ..utils import (
  7. int_or_none,
  8. unescapeHTML,
  9. )
  10. class TVNetIE(InfoExtractor):
  11. _VALID_URL = r'https?://(?:[^/]+)\.tvnet\.gov\.vn/[^/]+/(?P<id>[0-9]+)'
  12. _TESTS = [{
  13. # video
  14. 'url': 'http://de.tvnet.gov.vn/video/109788/vtv1---bac-tuyet-tai-lao-cai-va-ha-giang/tin-nong-24h',
  15. 'md5': 'b4d7abe0252c9b47774760b7519c7558',
  16. 'info_dict': {
  17. 'id': '109788',
  18. 'ext': 'mp4',
  19. 'title': 'VTV1 - Bắc tuyết tại Lào Cai và Hà Giang',
  20. 'thumbnail': r're:(?i)https?://.*\.(?:jpg|png)',
  21. 'is_live': False,
  22. 'view_count': int,
  23. },
  24. }, {
  25. # audio
  26. 'url': 'http://vn.tvnet.gov.vn/radio/27017/vov1---ban-tin-chieu-10062018/doi-song-va-xa-hoi',
  27. 'md5': 'b5875ce9b0a2eecde029216d0e6db2ae',
  28. 'info_dict': {
  29. 'id': '27017',
  30. 'ext': 'm4a',
  31. 'title': 'VOV1 - Bản tin chiều (10/06/2018)',
  32. 'thumbnail': r're:(?i)https?://.*\.(?:jpg|png)',
  33. 'is_live': False,
  34. },
  35. }, {
  36. # live stream
  37. 'url': 'http://us.tvnet.gov.vn/kenh-truyen-hinh/1011/vtv1',
  38. 'info_dict': {
  39. 'id': '1011',
  40. 'ext': 'mp4',
  41. 'title': r're:^VTV1 \| LiveTV [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
  42. 'thumbnail': r're:(?i)https?://.*\.(?:jpg|png)',
  43. 'is_live': True,
  44. },
  45. 'params': {
  46. 'skip_download': True,
  47. },
  48. }, {
  49. # radio live stream
  50. 'url': 'http://vn.tvnet.gov.vn/kenh-truyen-hinh/1014',
  51. 'info_dict': {
  52. 'id': '1014',
  53. 'ext': 'm4a',
  54. 'title': r're:VOV1 \| LiveTV [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
  55. 'thumbnail': r're:(?i)https?://.*\.(?:jpg|png)',
  56. 'is_live': True,
  57. },
  58. 'params': {
  59. 'skip_download': True,
  60. },
  61. }]
  62. def _real_extract(self, url):
  63. video_id = self._match_id(url)
  64. webpage = self._download_webpage(url, video_id)
  65. title = self._og_search_title(
  66. webpage, default=None) or self._html_search_meta(
  67. 'title', webpage, default=None) or self._search_regex(
  68. r'<title>([^<]+)<', webpage, 'title')
  69. title = re.sub(r'\s*-\s*TV Net\s*$', '', title)
  70. if '/video/' in url or '/radio/' in url:
  71. is_live = False
  72. elif '/kenh-truyen-hinh/' in url:
  73. is_live = True
  74. else:
  75. is_live = None
  76. data_file = unescapeHTML(self._search_regex(
  77. r'data-file=(["\'])(?P<url>(?:https?:)?//.+?)\1', webpage,
  78. 'data file', group='url'))
  79. stream_urls = set()
  80. formats = []
  81. for stream in self._download_json(data_file, video_id):
  82. if not isinstance(stream, dict):
  83. continue
  84. stream_url = stream.get('url')
  85. if (stream_url in stream_urls or not stream_url or
  86. not isinstance(stream_url, compat_str)):
  87. continue
  88. stream_urls.add(stream_url)
  89. formats.extend(self._extract_m3u8_formats(
  90. stream_url, video_id, 'mp4',
  91. entry_protocol='m3u8' if is_live else 'm3u8_native',
  92. m3u8_id='hls', fatal=False))
  93. self._sort_formats(formats)
  94. # better support for radio streams
  95. if title.startswith('VOV'):
  96. for f in formats:
  97. f.update({
  98. 'ext': 'm4a',
  99. 'vcodec': 'none',
  100. })
  101. thumbnail = self._og_search_thumbnail(
  102. webpage, default=None) or unescapeHTML(
  103. self._search_regex(
  104. r'data-image=(["\'])(?P<url>(?:https?:)?//.+?)\1', webpage,
  105. 'thumbnail', default=None, group='url'))
  106. if is_live:
  107. title = self._live_title(title)
  108. view_count = int_or_none(self._search_regex(
  109. r'(?s)<div[^>]+\bclass=["\'].*?view-count[^>]+>.*?(\d+).*?</div>',
  110. webpage, 'view count', default=None))
  111. return {
  112. 'id': video_id,
  113. 'title': title,
  114. 'thumbnail': thumbnail,
  115. 'is_live': is_live,
  116. 'view_count': view_count,
  117. 'formats': formats,
  118. }