from urllib2 import urlopen
from urllib import unquote, urlretrieve
from math import floor
import sys
import re
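# NOTE: this script targets Python 2 (urllib2/urllib and print statements);
# it will not run unmodified under Python 3.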


class YouTubeException(Exception):
    pass

def is_valid_url(url):
    # match a standard YouTube watch URL with an 11-character video id
    return re.search(r'http://www\.youtube\.com/watch\?v=[a-zA-Z0-9_-]{11}', url)


def get_quality(downloadLink):
    # itag values can be one to three digits (e.g. 18, 22, 100)
    regex = re.compile(r'itag=(\d{1,3})')
    # TODO - Translate the itag numbers into the quality names i.e. flv, webm, mp4, etc.
    return regex.findall(downloadLink)[0]


def get_extension(quality):
    # map YouTube itag values to their container/file extensions
    extMap = {
        "13": "3gp",
        "17": "3gp",
        "5": "flv",
        "34": "flv",
        "35": "flv",
        "18": "mp4",
        "22": "mp4",
        "37": "mp4",
        "43": "webm",
        "44": "webm",
        "45": "webm",
        "46": "webm",
        "100": "webm",
        "101": "webm",
        "102": "webm",
        "82": "mp4",
        "83": "mp4",
        "84": "mp4",
        "85": "mp4"
    }
    try:
        extension = extMap[quality]
    except KeyError:
        extension = "unknown"

    return extension


def get_download_links(url):
    downloadList = []
    
    # check to see that the url is valid
    if not is_valid_url(url):
        raise YouTubeException("Invalid URL Supplied")
        
    # download the youtube page's source
    try:
        rawHTML = urlopen(url).read()
    
        # extract the download links; they're embedded in a JSON array
        regex = re.compile('(?=map": "url=)(.*)(", "watermark":)')
        matches = regex.search(rawHTML)
        
        # unable to find matches. This shouldn't happen
        if matches is None:
            raise YouTubeException("Unable to parse YouTube's source. Perhaps they changed their layout?")
        
        else:
            # clean up and slice up the download links. 
            matchGroup = unquote(matches.groups(0)[0][7:]).split("url=")
            urllist = filter(None, matchGroup)

            # now iterate through each download link and build our dictionary
            for link in urllist:
                quality = get_quality(link)
                link = re.sub(r'\\u0026quality.+', '', link) # strip the unnecessary query string (the '&' is JSON-escaped as \u0026 in the page source)
                
                downloadList.append({'format':quality,'url':link})
            
            return downloadList
    
    except Exception as e:
        raise YouTubeException("An error occurred. %s" % e)


def progressHandler(blocks_transferred, block_size, total_size):
    # urlretrieve reporthook: called with the number of blocks transferred so far,
    # the block size in bytes, and the total file size
    percent = min(floor((blocks_transferred * block_size * 100) / total_size), 100)
    sys.stdout.write("%d percent downloaded...\r" % percent)
    sys.stdout.flush()

def downloadVideo(url, fname):
    # build the destination filename from the itag-derived extension and download the file
    dest = '%s.%s' % (fname, get_extension(url['format']))
    urlretrieve(url['url'], dest, progressHandler)


def main():

    try:
        url = sys.argv[1]
        filename = sys.argv[2]

        print "Obtaining download links... "
        dList = get_download_links(url)
        extension = get_extension(dList[0]['format'])
            
        print "Downloading the highest available format (%s)... " % extension
        # we want to download the highest quality, so the first element would be it
        downloadVideo(dList[0],filename)
            
    except IndexError:
        print "Missing argument. No YouTube url supplied. \n"
        sys.exit(1)
        
    except YouTubeException as e:
        print "Error: %s" % e.message
        sys.exit(1)
        
if __name__ == "__main__":
    main()
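
# Example invocation (a sketch; assumes this file is saved as youtube.py and the
# 11-character video id below is a placeholder, not a real video):
#
#   python youtube.py "http://www.youtube.com/watch?v=XXXXXXXXXXX" myvideo
#
# The first (highest quality) format returned by get_download_links() is downloaded
# and saved as myvideo.<extension>, with progress printed to stdout.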