"""
Scrape ESPN MLB power rankings and season standings, then measure how
biased each week's rankings are relative to teams' win-loss records.
"""
from bs4 import BeautifulSoup
import socket
import urllib.error
import urllib.request

from urllib import parse as urlparse  # @UnusedImport

directory = '/Users/cuthbert/Desktop/PowerRankingsMLB/'
urlBase = 'http://espn.go.com/mlb/powerrankings/_/year/'


def getPowerRankingsForYear(year):
    '''Download every week's power-ranking page for one year to local HTML files.'''
    urlBaseYear = urlBase + str(year) + '/week/'
    minWeek = 1
    if year == 2005:
        minWeek = 16
    maxWeek = 26
    if year == 2014:
        maxWeek = 10
    for wk in range(minWeek, maxWeek + 1):
        urlBaseWeek = urlBaseYear + str(wk)
        outFP = directory + str(year) + '-' + str(wk) + '.html'
        html = getHTMLfromURL(urlBaseWeek)
        if html is None:  # skip weeks that failed to download
            continue
        with open(outFP, 'w') as f:
            f.write(html)
        print(urlBaseWeek)
        # print(outFP)


def getHTMLfromURL(url, referer=None):
    '''Fetch a URL and return its HTML as text, or None on error or timeout.'''
    opener = urllib.request.build_opener()
    userAgent = ('User-agent',
                 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_5) '
                 'AppleWebKit/536.30.1 (KHTML, like Gecko) '
                 'Version/6.0.5 Safari/536.30.1')
    if referer is None:
        opener.addheaders = [userAgent]
    else:
        opener.addheaders = [('Referer', referer), userAgent]
    try:
        fh = opener.open(url, None, 4)  # 4 seconds is enough... files should be pretty small
        allHTML = fh.read().decode('utf-8', errors='replace')
        fh.close()
        return allHTML
    except (urllib.error.URLError, socket.timeout):
        return None


def getHTMLfromFile(fp):
    with open(fp) as f:
        return f.read()


def getRanks(html):
    '''Return a list of (rank, teamName, wins, losses) tuples from a rankings page.'''
    soup = BeautifulSoup(html, 'html.parser')
    allRanks = [td for td in soup.find_all('td')
                if 'pr-rank' in (td.get('class') or [])]
    allTuples = []
    for rankTd in allRanks:
        rank = int(rankTd.contents[0])
        # the sibling cell holds the team name and its "W-L" record
        team = rankTd.next_sibling.contents[1].contents[0].contents[0]
        record = rankTd.next_sibling.contents[1].contents[2].contents[0]
        wins, losses = record.split('-')
        allTuples.append((rank, team, int(wins), int(losses)))
    return allTuples


def calculateOneWeekBias(rankList):
    '''
    For each team, sum the rank distance to every other team whose ranking
    contradicts the win-loss comparison.  Positive bias means the team is
    ranked better than its record justifies; negative means ranked worse.
    '''
    bias = {}
    for teamTuple in rankList:
        teamBias = 0
        rank = teamTuple[0]
        name = teamTuple[1]
        winLoss = teamTuple[2] - teamTuple[3]
        for ttOthers in rankList:
            if teamTuple == ttOthers:
                continue  # not actually necessary since bias will be zero...
            orank = ttOthers[0]
            owinLoss = ttOthers[2] - ttOthers[3]
            thisBias = 0
            ## remember a "higher ranking team" has a lower "rank" score!
            if winLoss > owinLoss and rank > orank:
                thisBias = orank - rank  # better record, worse rank: negative
            elif winLoss < owinLoss and rank < orank:
                thisBias = orank - rank  # worse record, better rank: positive
            # print(teamTuple, winLoss, ttOthers, owinLoss, ": ", thisBias, rank, orank)
            teamBias += thisBias
        # print(name, teamBias)
        bias[name] = teamBias
    return bias


def runOneWeek(year, week):
    fp = directory + str(year) + '-' + str(week) + '.html'
    html = getHTMLfromFile(fp)
    ranks = getRanks(html)
    return calculateOneWeekBias(ranks)


def runOneYear(year):
    minWeek = 1
    if year == 2005:
        minWeek = 16
    maxWeek = 26
    if year == 2014:
        maxWeek = 10
    bias = {}
    for wk in range(minWeek, maxWeek + 1):
        weekBias = runOneWeek(year, wk)
        for x in weekBias:
            if x not in bias:
                bias[x] = weekBias[x]
            else:
                bias[x] += weekBias[x]
    return bias


def runFullDataset():
    minYear = 2005
    maxYear = 2014
    bias = {}
    for year in range(minYear, maxYear + 1):
        yearBias = runOneYear(year)
        for x in yearBias:
            if x not in bias:
                bias[x] = yearBias[x]
            else:
                bias[x] += yearBias[x]
    for x in sorted(bias):
        print(x, bias[x])
    return bias


def dlRecordsAll():
    for year in range(2005, 2015):
        dlRecordForYear(year)


def dlRecordForYear(year):
    outFP = directory + 'record-' + str(year) + '.html'
    url = 'http://espn.go.com/mlb/standings/_/year/' + str(year)
    # print(outFP)
    print(url)
    html = getHTMLfromURL(url)
    if html is None:  # skip years that failed to download
        return
    with open(outFP, 'w') as f:
        f.write(html)


def extractRecordsForYear(year):
    '''Return a list of (teamName, wins, losses) tuples from a saved standings page.'''
    fp = directory + 'record-' + str(year) + '.html'
    html = getHTMLfromFile(fp)
    soup = BeautifulSoup(html, 'html.parser')
    allStats = [tr for tr in soup.find_all('tr')
                if 'oddrow' in (tr.get('class') or [])
                or 'evenrow' in (tr.get('class') or [])]
    returnStats = []
    for statline in allStats:
        try:
            name = statline.contents[0].find_all('a')[0].contents[0]
        except (IndexError, AttributeError):  # Miami -- name change
            try:
                name = statline.contents[0].contents[0]
                name = name.split('-')[-1]  # in case Miami clinches HA!
            except (IndexError, AttributeError):
                continue
        wins = statline.contents[1].contents[0]
        losses = statline.contents[2].contents[0]
        recordTuple = (str(name), int(wins), int(losses))
        # print(recordTuple)
        returnStats.append(recordTuple)
    return returnStats


def extractAllRecords():
    allStats = {}
    for year in range(2005, 2015):
        yearStats = extractRecordsForYear(year)
        for teamResult in yearStats:
            if teamResult[0] not in allStats:
                allStats[teamResult[0]] = {'wins': 0, 'losses': 0}
            allStats[teamResult[0]]['wins'] += teamResult[1]
            allStats[teamResult[0]]['losses'] += teamResult[2]
    return allStats


if __name__ == '__main__':
    # allStats = extractAllRecords()
    # for team in sorted(allStats):
    #     allWins = allStats[team]['wins']
    #     allLosses = allStats[team]['losses']
    #     allResults = (team, allWins, allLosses, allWins - allLosses, allWins + allLosses)
    #     print(allResults)
    # print(extractRecordsForYear(2006))
    # dlRecordsAll()
    runFullDataset()
    # for i in range(2006, 2015):
    #     getPowerRankingsForYear(i)
    # html = getHTMLfromFile('/Users/cuthbert/Desktop/testPR.html')
    # rl = getRanks(html)
    # print(calculateOneWeekBias(rl))
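
# A minimal sanity check for calculateOneWeekBias, using made-up teams and
# records (not real ESPN data): the team ranked ahead of a rival with a better
# record accrues positive (favorable) bias, and the slighted rival gets the
# mirror-image negative value.
#
#     >>> calculateOneWeekBias([(1, 'Team A', 10, 20), (2, 'Team B', 20, 10)])
#     {'Team A': 1, 'Team B': -1}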