#!/usr/bin/env python
# -*- coding: utf-8 -*-
# youtube-dl
# Author: Ricardo Garcia Gonzalez
# License: Public domain code
  5. import htmlentitydefs
  6. import httplib
  7. import math
  8. import netrc
  9. import os
  10. import os.path
  11. import re
  12. import socket
  13. import string
  14. import sys
  15. import time
  16. import urllib
  17. import urllib2
  18. std_headers = {
  19. 'User-Agent': 'UserAgent: Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9) Gecko/2008052906 Firefox/3.0',
  20. 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
  21. 'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
  22. 'Accept-Language': 'en-us,en;q=0.5',
  23. }
  24. simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
  25. class FileDownloader(object):
  26. """File Downloader class.
  27. File downloader objects are the ones responsible of downloading the
  28. actual video file and writing it to disk if the user has requested
  29. it, among some other tasks. In most cases there should be one per
  30. program. As, given a video URL, the downloader doesn't know how to
  31. extract all the needed information, task that InfoExtractors do, it
  32. has to pass the URL to one of them.
  33. For this, file downloader objects have a method that allows
  34. InfoExtractors to be registered in a given order. When it is passed
  35. a URL, the file downloader handles it to the first InfoExtractor it
  36. finds that reports being able to handle it. The InfoExtractor returns
  37. all the information to the FileDownloader and the latter downloads the
  38. file or does whatever it's instructed to do.
  39. File downloaders accept a lot of parameters. In order not to saturate
  40. the object constructor with arguments, it receives a dictionary of
  41. options instead. These options are available through the get_params()
  42. method for the InfoExtractors to use. The FileDownloader also registers
  43. itself as the downloader in charge for the InfoExtractors that are
  44. added to it, so this is a "mutual registration".
  45. Available options:
  46. username: Username for authentication purposes.
  47. password: Password for authentication purposes.
  48. usenetrc: Use netrc for authentication instead.
  49. quiet: Do not print messages to stdout.
  50. simulate: Do not download the video files.
  51. format: Video format code.
  52. outtmpl: Template for output names.
  53. """
  54. _params = None
  55. _ies = []
  56. def __init__(self, params):
  57. self._ies = []
  58. self.set_params(params)
  59. @staticmethod
  60. def pmkdir(filename):
  61. """Create directory components in filename. Similar to Unix "mkdir -p"."""
  62. components = filename.split(os.sep)
  63. aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
  64. for dir in aggregate:
  65. if not os.path.exists(dir):
  66. os.mkdir(dir)
  67. @staticmethod
  68. def format_bytes(bytes):
  69. if bytes is None:
  70. return 'N/A'
  71. if bytes == 0:
  72. exponent = 0
  73. else:
  74. exponent = long(math.log(float(bytes), 1024.0))
  75. suffix = 'bkMGTPEZY'[exponent]
  76. converted = float(bytes) / float(1024**exponent)
  77. return '%.2f%s' % (converted, suffix)
  78. @staticmethod
  79. def calc_percent(byte_counter, data_len):
  80. if data_len is None:
  81. return '---.-%'
  82. return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
  83. @staticmethod
  84. def calc_eta(start, now, total, current):
  85. if total is None:
  86. return '--:--'
  87. dif = now - start
  88. if current == 0 or dif < 0.001: # One millisecond
  89. return '--:--'
  90. rate = float(current) / dif
  91. eta = long((float(total) - float(current)) / rate)
  92. (eta_mins, eta_secs) = divmod(eta, 60)
  93. if eta_mins > 99:
  94. return '--:--'
  95. return '%02d:%02d' % (eta_mins, eta_secs)
  96. @staticmethod
  97. def calc_speed(start, now, bytes):
  98. dif = now - start
  99. if bytes == 0 or dif < 0.001: # One millisecond
  100. return '%10s' % '---b/s'
  101. return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
  102. @staticmethod
  103. def best_block_size(elapsed_time, bytes):
  104. new_min = max(bytes / 2.0, 1.0)
  105. new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
  106. if elapsed_time < 0.001:
  107. return int(new_max)
  108. rate = bytes / elapsed_time
  109. if rate > new_max:
  110. return int(new_max)
  111. if rate < new_min:
  112. return int(new_min)
  113. return int(rate)
  114. def set_params(self, params):
  115. """Sets parameters."""
  116. if type(params) != dict:
  117. raise ValueError('params: dictionary expected')
  118. self._params = params
  119. def get_params(self):
  120. """Get parameters."""
  121. return self._params
  122. def add_info_extractor(self, ie):
  123. """Add an InfoExtractor object to the end of the list."""
  124. self._ies.append(ie)
  125. ie.set_downloader(self)
  126. def to_stdout(self, message, skip_eol=False):
  127. """Print message to stdout if not in quiet mode."""
  128. if not self._params.get('quiet', False):
  129. sys.stdout.write('%s%s' % (message, ['\n', ''][skip_eol]))
  130. sys.stdout.flush()
  131. def to_stderr(self, message):
  132. """Print message to stderr."""
  133. sys.stderr.write('%s\n' % message)
  134. def download(self, url_list):
  135. """Download a given list of URLs."""
  136. for url in url_list:
  137. suitable_found = False
  138. for ie in self._ies:
  139. if not ie.suitable(url):
  140. continue
  141. # Suitable InfoExtractor found
  142. suitable_found = True
  143. results = [x for x in ie.extract(url) if x is not None]
  144. if (len(url_list) > 1 or len(results) > 1) and re.search(r'%\(.+?\)s', self._params['outtmpl']) is None:
  145. sys.exit('ERROR: fixed output name but more than one file to download')
  146. if self._params.get('simulate', False):
  147. continue
  148. for result in results:
  149. try:
  150. filename = self._params['outtmpl'] % result
  151. except (KeyError), err:
  152. self.to_stderr('ERROR: invalid output template: %s' % str(err))
  153. continue
  154. try:
  155. self.pmkdir(filename)
  156. except (OSError, IOError), err:
  157. self.to_stderr('ERROR: unable to create directories: %s' % str(err))
  158. continue
  159. try:
  160. outstream = open(filename, 'wb')
  161. except (OSError, IOError), err:
  162. self.to_stderr('ERROR: unable to open for writing: %s' % str(err))
  163. continue
  164. try:
  165. self._do_download(outstream, result['url'])
  166. outstream.close()
  167. except (OSError, IOError), err:
  168. self.to_stderr('ERROR: unable to write video data: %s' % str(err))
  169. continue
  170. except (urllib2.URLError, httplib.HTTPException, socket.error), err:
  171. self.to_stderr('ERROR: unable to download video data: %s' % str(err))
  172. continue
  173. break
  174. if not suitable_found:
  175. self.to_stderr('ERROR: no suitable InfoExtractor: %s' % url)
  176. def _do_download(self, stream, url):
  177. request = urllib2.Request(url, None, std_headers)
  178. data = urllib2.urlopen(request)
  179. data_len = data.info().get('Content-length', None)
  180. data_len_str = self.format_bytes(data_len)
  181. byte_counter = 0
  182. block_size = 1024
  183. start = time.time()
  184. while True:
  185. percent_str = self.calc_percent(byte_counter, data_len)
  186. eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
  187. speed_str = self.calc_speed(start, time.time(), byte_counter)
  188. self.to_stdout('\r[download] %s of %s at %s ETA %s' %
  189. (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
  190. before = time.time()
  191. data_block = data.read(block_size)
  192. after = time.time()
  193. data_block_len = len(data_block)
  194. if data_block_len == 0:
  195. break
  196. byte_counter += data_block_len
  197. stream.write(data_block)
  198. block_size = self.best_block_size(after - before, data_block_len)
  199. self.to_stdout('')
  200. if data_len is not None and str(byte_counter) != data_len:
  201. raise ValueError('Content too short: %s/%s bytes' % (byte_counter, data_len))
  202. class InfoExtractor(object):
  203. """Information Extractor class.
  204. Information extractors are the classes that, given a URL, extract
  205. information from the video (or videos) the URL refers to. This
  206. information includes the real video URL, the video title and simplified
  207. title, author and others. It is returned in a list of dictionaries when
  208. calling its extract() method. It is a list because a URL can refer to
  209. more than one video (think of playlists). The dictionaries must include
  210. the following fields:
  211. id: Video identifier.
  212. url: Final video URL.
  213. uploader: Nickname of the video uploader.
  214. title: Literal title.
  215. stitle: Simplified title.
  216. ext: Video filename extension.
  217. Subclasses of this one should re-define the _real_initialize() and
  218. _real_extract() methods, as well as the suitable() static method.
  219. Probably, they should also be instantiated and added to the main
  220. downloader.
  221. """
  222. _ready = False
  223. _downloader = None
  224. def __init__(self, downloader=None):
  225. """Constructor. Receives an optional downloader."""
  226. self._ready = False
  227. self.set_downloader(downloader)
  228. @staticmethod
  229. def suitable(url):
  230. """Receives a URL and returns True if suitable for this IE."""
  231. return True
  232. def initialize(self):
  233. """Initializes an instance (login, etc)."""
  234. if not self._ready:
  235. self._real_initialize()
  236. self._ready = True
  237. def extract(self, url):
  238. """Extracts URL information and returns it in list of dicts."""
  239. self.initialize()
  240. return self._real_extract(url)
  241. def set_downloader(self, downloader):
  242. """Sets the downloader for this IE."""
  243. self._downloader = downloader
  244. def to_stdout(self, message):
  245. if self._downloader is None or not self._downloader.get_params().get('quiet', False):
  246. print message
  247. def to_stderr(self, message):
  248. sys.stderr.write('%s\n' % message)
  249. def _real_initialize(self):
  250. """Real initialization process. Redefine in subclasses."""
  251. pass
  252. def _real_extract(self, url):
  253. """Real extraction process. Redefine in subclasses."""
  254. pass
  255. class YoutubeIE(InfoExtractor):
  256. """Information extractor for youtube.com."""
  257. _LOGIN_URL = 'http://www.youtube.com/login?next=/'
  258. _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/'
  259. _NETRC_MACHINE = 'youtube'
  260. def _real_initialize(self):
  261. if self._downloader is None:
  262. return
  263. username = None
  264. password = None
  265. downloader_params = self._downloader.get_params()
  266. # Attempt to use provided username and password or .netrc data
  267. if downloader_params.get('username', None) is not None:
  268. username = downloader_params['username']
  269. password = downloader_params['password']
  270. elif downloader_params.get('usenetrc', False):
  271. try:
  272. info = netrc.netrc().authenticators(self._NETRC_MACHINE)
  273. if info is not None:
  274. username = info[0]
  275. password = info[2]
  276. else:
  277. raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
  278. except (IOError, netrc.NetrcParseError), err:
  279. self.to_stderr('WARNING: parsing .netrc: %s' % str(err))
  280. return
  281. if username is None:
  282. return
  283. # Log in
  284. login_form = {
  285. 'current_form': 'loginForm',
  286. 'next': '/',
  287. 'action_login': 'Log In',
  288. 'username': username,
  289. 'password': password,
  290. }
  291. request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
  292. try:
  293. self.to_stdout('[youtube] Logging in')
  294. login_results = urllib2.urlopen(request).read()
  295. if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
  296. self.to_stderr('WARNING: Unable to log in: bad username or password')
  297. return
  298. except (urllib2.URLError, httplib.HTTPException, socket.error), err:
  299. self.to_stderr('WARNING: Unable to log in: %s' % str(err))
  300. return
  301. # Confirm age
  302. age_form = {
  303. 'next_url': '/',
  304. 'action_confirm': 'Confirm',
  305. }
  306. request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
  307. try:
  308. self.to_stdout('[youtube] Confirming age')
  309. age_results = urllib2.urlopen(request).read()
  310. except (urllib2.URLError, httplib.HTTPException, socket.error), err:
  311. sys.exit('ERROR: Unable to confirm age: %s' % str(err))
  312. def _real_extract(self, url):
  313. # Extract video id from URL
  314. mobj = re.match(r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?\?(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$', url)
  315. if mobj is None:
  316. self.to_stderr('ERROR: Invalid URL: %s' % url)
  317. return [None]
  318. video_id = mobj.group(2)
  319. # Downloader parameters
  320. format_param = None
  321. if self._downloader is not None:
  322. params = self._downloader.get_params()
  323. format_param = params.get('format', None)
  324. # Extension
  325. video_extension = {18: 'mp4'}.get(format_param, 'flv')
  326. # Normalize URL, including format
  327. normalized_url = 'http://www.youtube.com/watch?v=%s' % video_id
  328. if format_param is not None:
  329. normalized_url = '%s&fmt=%s' % (normalized_url, format_param)
  330. request = urllib2.Request(normalized_url, None, std_headers)
  331. try:
  332. self.to_stdout('[youtube] %s: Downloading video webpage' % video_id)
  333. video_webpage = urllib2.urlopen(request).read()
  334. except (urllib2.URLError, httplib.HTTPException, socket.error), err:
  335. sys.exit('ERROR: Unable to download video: %s' % str(err))
  336. self.to_stdout('[youtube] %s: Extracting video information' % video_id)
  337. # "t" param
  338. mobj = re.search(r', "t": "([^"]+)"', video_webpage)
  339. if mobj is None:
  340. self.to_stderr('ERROR: Unable to extract "t" parameter')
  341. return [None]
  342. video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s' % (video_id, mobj.group(1))
  343. if format_param is not None:
  344. video_real_url = '%s&fmt=%s' % (video_real_url, format_param)
  345. self.to_stdout('[youtube] %s: URL: %s' % (video_id, video_real_url))
  346. # uploader
  347. mobj = re.search(r'More From: ([^<]*)<', video_webpage)
  348. if mobj is None:
  349. self.to_stderr('ERROR: Unable to extract uploader nickname')
  350. return [None]
  351. video_uploader = mobj.group(1)
  352. # title
  353. mobj = re.search(r'(?im)<title>YouTube - ([^<]*)</title>', video_webpage)
  354. if mobj is None:
  355. self.to_stderr('ERROR: Unable to extract video title')
  356. return [None]
  357. video_title = mobj.group(1).decode('utf-8')
  358. video_title = re.sub(u'&(.+?);', lambda x: unichr(htmlentitydefs.name2codepoint[x.group(1)]), video_title)
  359. # simplified title
  360. simple_title = re.sub(u'([^%s]+)' % simple_title_chars, u'_', video_title)
  361. simple_title = simple_title.strip(u'_')
  362. # Return information
  363. return [{
  364. 'id': video_id,
  365. 'url': video_real_url,
  366. 'uploader': video_uploader,
  367. 'title': video_title,
  368. 'stitle': simple_title,
  369. 'ext': video_extension,
  370. }]
  371. if __name__ == '__main__':
  372. try:
  373. # General configuration
  374. urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler()))
  375. urllib2.install_opener(urllib2.build_opener(urllib2.HTTPCookieProcessor()))
  376. # Information extractors
  377. youtube_ie = YoutubeIE()
  378. # File downloader
  379. fd = FileDownloader({
  380. 'usenetrc': False,
  381. 'username': None,
  382. 'password': None,
  383. 'quiet': False,
  384. 'simulate': True,
  385. 'format': None,
  386. 'outtmpl': '%(id)s.%(ext)s'
  387. })
  388. fd.add_info_extractor(youtube_ie)
  389. fd.download([
  390. 'http://www.youtube.com/watch?v=t7qdwI7TVe8',
  391. 'http://www.youtube.com/watch?v=IJyn3pRcy_Q',
  392. 'http://www.youtube.com/watch?v=DZRXe1wtC-M',
  393. ])
  394. except KeyboardInterrupt:
  395. sys.exit('\nERROR: Interrupted by user')