youporn.py 9.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228
  1. # coding: utf-8
  2. from __future__ import unicode_literals
  3. import re
  4. from .common import InfoExtractor
  5. from ..utils import (
  6. extract_attributes,
  7. int_or_none,
  8. str_to_int,
  9. merge_dicts,
  10. T,
  11. traverse_obj,
  12. unified_strdate,
  13. url_or_none,
  14. )
  15. class YouPornIE(InfoExtractor):
  16. _VALID_URL = r'https?://(?:www\.)?youporn\.com/(?:watch|embed)/(?P<id>\d+)(?:/(?P<display_id>[^/?#&]+))?'
  17. _EMBED_REGEX = [r'<iframe[^>]+\bsrc=["\'](?P<url>(?:https?:)?//(?:www\.)?youporn\.com/embed/\d+)']
  18. _TESTS = [{
  19. 'url': 'http://www.youporn.com/watch/505835/sex-ed-is-it-safe-to-masturbate-daily/',
  20. 'md5': '3744d24c50438cf5b6f6d59feb5055c2',
  21. 'info_dict': {
  22. 'id': '505835',
  23. 'display_id': 'sex-ed-is-it-safe-to-masturbate-daily',
  24. 'ext': 'mp4',
  25. 'title': 'Sex Ed: Is It Safe To Masturbate Daily?',
  26. 'description': 'Love & Sex Answers: http://bit.ly/DanAndJenn -- Is It Unhealthy To Masturbate Daily?',
  27. 'thumbnail': r're:^https?://.*\.jpg$',
  28. 'duration': 210,
  29. 'uploader': 'Ask Dan And Jennifer',
  30. 'upload_date': '20101217',
  31. 'average_rating': int,
  32. 'view_count': int,
  33. 'categories': list,
  34. 'tags': list,
  35. 'age_limit': 18,
  36. },
  37. 'skip': 'This video has been disabled',
  38. }, {
  39. # Unknown uploader
  40. 'url': 'http://www.youporn.com/watch/561726/big-tits-awesome-brunette-on-amazing-webcam-show/?from=related3&al=2&from_id=561726&pos=4',
  41. 'info_dict': {
  42. 'id': '561726',
  43. 'display_id': 'big-tits-awesome-brunette-on-amazing-webcam-show',
  44. 'ext': 'mp4',
  45. 'title': 'Big Tits Awesome Brunette On amazing webcam show',
  46. 'description': 'http://sweetlivegirls.com Big Tits Awesome Brunette On amazing webcam show.mp4',
  47. 'thumbnail': r're:^https?://.*\.jpg$',
  48. 'uploader': 'Unknown',
  49. 'upload_date': '20110418',
  50. 'average_rating': int,
  51. 'view_count': int,
  52. 'categories': list,
  53. 'tags': list,
  54. 'age_limit': 18,
  55. },
  56. 'params': {
  57. 'skip_download': True,
  58. },
  59. 'skip': '404',
  60. }, {
  61. 'url': 'https://www.youporn.com/embed/505835/sex-ed-is-it-safe-to-masturbate-daily/',
  62. 'only_matching': True,
  63. }, {
  64. 'url': 'http://www.youporn.com/watch/505835',
  65. 'only_matching': True,
  66. }, {
  67. 'url': 'https://www.youporn.com/watch/13922959/femdom-principal/',
  68. 'only_matching': True,
  69. }, {
  70. 'url': 'https://www.youporn.com/watch/16290308/tinderspecial-trailer1/',
  71. 'info_dict': {
  72. 'id': '46949121',
  73. 'age_limit': 18,
  74. 'categories': [],
  75. 'description': None, # SEO spam using title removed
  76. 'display_id': 'tinderspecial-trailer1',
  77. 'duration': 298.0,
  78. 'ext': 'mp4',
  79. 'upload_date': '20201123',
  80. 'uploader': 'Ersties',
  81. 'tags': [],
  82. 'thumbnail': 'https://fi1.ypncdn.com/m=eaSaaTbWx/202011/23/16290308/original/3.jpg',
  83. 'timestamp': 1606147564,
  84. 'title': 'Tinder In Real Life',
  85. 'view_count': int,
  86. }
  87. }]
  88. @classmethod
  89. def _extract_urls(cls, webpage):
  90. def yield_urls():
  91. for p in cls._EMBED_REGEX:
  92. for m in re.finditer(p, webpage):
  93. yield m.group('url')
  94. return list(yield_urls())
  95. def _real_extract(self, url):
  96. display_id = self._match_valid_url(url).group('id', 'display_id')
  97. url = 'http://www.youporn.com/watch/%s' % (display_id[0],)
  98. display_id = display_id[1] or display_id[0]
  99. webpage = self._download_webpage(
  100. url, display_id, headers={'Cookie': 'age_verified=1'})
  101. video_id = display_id[0]
  102. playervars = self._search_json(
  103. r'\bplayervars\s*:', webpage, 'playervars', display_id)
  104. def get_fmt(x):
  105. v_url = url_or_none(x.get('videoUrl'))
  106. if v_url:
  107. x['videoUrl'] = v_url
  108. return (x['format'], x)
  109. defs_by_format = dict(traverse_obj(playervars, (
  110. 'mediaDefinitions', lambda _, v: v.get('format'), T(get_fmt))))
  111. def get_format_data(f):
  112. if f not in defs_by_format:
  113. return []
  114. return self._download_json(
  115. defs_by_format[f]['videoUrl'], display_id, '{0}-formats'.format(f))
  116. formats = []
  117. # Try to extract only the actual master m3u8 first, avoiding the duplicate single resolution "master" m3u8s
  118. for hls_url in traverse_obj(
  119. get_format_data('hls'),
  120. (lambda _, v: not isinstance(v['defaultQuality'], bool), 'videoUrl'),
  121. (Ellipsis, 'videoUrl')):
  122. formats.extend(self._extract_m3u8_formats(
  123. hls_url, video_id, 'mp4', fatal=False, m3u8_id='hls',
  124. entry_protocol='m3u8_native'))
  125. for f in traverse_obj(get_format_data('mp4'), (
  126. lambda _, v: v.get('videoUrl'), {
  127. 'url': ('videoUrl', T(url_or_none)),
  128. 'filesize': ('videoSize', T(int_or_none)),
  129. 'height': ('quality', T(int_or_none)),
  130. }, T(lambda x: x.get('videoUrl') and x))):
  131. # Video URL's path looks like this:
  132. # /201012/17/505835/720p_1500k_505835/YouPorn%20-%20Sex%20Ed%20Is%20It%20Safe%20To%20Masturbate%20Daily.mp4
  133. # /201012/17/505835/vl_240p_240k_505835/YouPorn%20-%20Sex%20Ed%20Is%20It%20Safe%20To%20Masturbate%20Daily.mp4
  134. # /videos/201703/11/109285532/1080P_4000K_109285532.mp4
  135. # We will benefit from it by extracting some metadata
  136. mobj = re.search(r'(?P<height>\d{3,4})[pP]_(?P<bitrate>\d+)[kK]_\d+', f['videoUrl'])
  137. if mobj:
  138. if not f.get('height'):
  139. f['height'] = int(mobj.group('height'))
  140. f['tbr'] = int(mobj.group('bitrate'))
  141. f['format_id'] = '%dp-%dk' % (f['height'], f['tbr'])
  142. formats.append(f)
  143. self._sort_formats(formats)
  144. title = self._html_search_regex(
  145. r'(?s)<div[^>]+class=["\']watchVideoTitle[^>]+>(.+?)</div>',
  146. webpage, 'title', default=None) or self._og_search_title(
  147. webpage, default=None) or self._html_search_meta(
  148. 'title', webpage, fatal=True)
  149. description = self._html_search_regex(
  150. r'(?s)<div[^>]+\bid=["\']description["\'][^>]*>(.+?)</div>',
  151. webpage, 'description',
  152. default=None) or self._og_search_description(
  153. webpage, default=None)
  154. thumbnail = self._search_regex(
  155. r'(?:imageurl\s*=|poster\s*:)\s*(["\'])(?P<thumbnail>.+?)\1',
  156. webpage, 'thumbnail', fatal=False, group='thumbnail')
  157. duration = int_or_none(self._html_search_meta(
  158. 'video:duration', webpage, 'duration', fatal=False))
  159. uploader = self._html_search_regex(
  160. r'(?s)<div[^>]+class=["\']submitByLink["\'][^>]*>(.+?)</div>',
  161. webpage, 'uploader', fatal=False)
  162. upload_date = unified_strdate(self._html_search_regex(
  163. (r'UPLOADED:\s*<span>([^<]+)',
  164. r'Date\s+[Aa]dded:\s*<span>([^<]+)',
  165. r'''(?s)<div[^>]+class=["']videoInfo(?:Date|Time)\b[^>]*>(.+?)</div>''',
  166. r'(?s)<label\b[^>]*>Uploaded[^<]*</label>\s*<span\b[^>]*>(.+?)</span>'),
  167. webpage, 'upload date', fatal=False))
  168. age_limit = self._rta_search(webpage)
  169. view_count = None
  170. views = self._search_regex(
  171. r'(<div[^>]+\bclass=["\']js_videoInfoViews["\']>)', webpage,
  172. 'views', default=None)
  173. if views:
  174. view_count = str_to_int(extract_attributes(views).get('data-value'))
  175. comment_count = str_to_int(self._search_regex(
  176. r'>All [Cc]omments? \(([\d,.]+)\)',
  177. webpage, 'comment count', default=None))
  178. def extract_tag_box(regex, title):
  179. tag_box = self._search_regex(regex, webpage, title, default=None)
  180. if not tag_box:
  181. return []
  182. return re.findall(r'<a[^>]+href=[^>]+>([^<]+)', tag_box)
  183. categories = extract_tag_box(
  184. r'(?s)Categories:.*?</[^>]+>(.+?)</div>', 'categories')
  185. tags = extract_tag_box(
  186. r'(?s)Tags:.*?</div>\s*<div[^>]+class=["\']tagBoxContent["\'][^>]*>(.+?)</div>',
  187. 'tags')
  188. data = self._search_json_ld(webpage, video_id, expected_type='VideoObject', fatal=False) or {}
  189. data.pop('url', None)
  190. result = merge_dicts(data, {
  191. 'id': video_id,
  192. 'display_id': display_id if display_id != video_id else None,
  193. 'title': title,
  194. 'description': description,
  195. 'thumbnail': thumbnail,
  196. 'duration': duration,
  197. 'uploader': uploader,
  198. 'upload_date': upload_date,
  199. 'view_count': view_count,
  200. 'comment_count': comment_count,
  201. 'categories': categories,
  202. 'tags': tags,
  203. 'age_limit': age_limit,
  204. 'formats': formats,
  205. })
  206. return result