# Python Recommender System
# Bill Whiteley, Stats 252 Homework 5
# Imports
import sys
import time
import os
import stat
import csv
import pydelicious
import datetime
# This function gets all of the url posts for a given url and stores them
# in a file
def dump_urlposts(link='http://www.weigend.com/', output_file='linkdata.csv'):
    """Write the posts for *link*, and for its www-less variant, to a CSV file.

    *output_file* is first (re)created with the posts returned by
    ``pydelicious.get_urlposts(link)``.  If removing a leading ``www.`` from
    the host part of *link* yields a different URL, the posts for that
    variant are appended to the same file.  Each post becomes one line of the
    form ``user,tags,dt``.

    Args:
        link: URL whose posts are fetched.
        output_file: Path of the CSV file to write.

    Returns:
        None.
    """
    _write_urlposts(link, output_file, "w",
                    'Writing url www posts to file %s')
    bare_link = _strip_www(link)
    # The old code used link.rstrip('www.WWW.'), which strips trailing
    # characters from the set {w, W, .} rather than the "www." host prefix,
    # so the same URL was fetched twice and every row was duplicated.
    if bare_link != link:
        _write_urlposts(bare_link, output_file, "a",
                        'Writing url (-www) posts to file %s')


def _strip_www(link):
    """Return *link* with a leading ``www.`` removed from its host part."""
    scheme, separator, rest = link.partition('://')
    if not separator:
        # No scheme present, so the host starts at the beginning of the string.
        scheme, rest = '', link
    if rest[:4].lower() == 'www.':
        rest = rest[4:]
    return scheme + separator + rest


def _write_urlposts(link, output_file, mode, message):
    """Fetch the posts for *link* and write them to *output_file* using *mode*."""
    # One second pause before each request, presumably to stay polite to the
    # remote service.
    time.sleep(1)
    posts = pydelicious.get_urlposts(link)
    with open(output_file, mode) as out:
        print(message % output_file)
        for item in posts:
            # Build the whole row first so a row is never half written.
            out.write(','.join((item.user, item.tags, item.dt)) + '\n')
def write_user_urls(user='whitewi4', output_file='adj_links.csv'):
    """Append the posts of *user* to *output_file* as ``user,tags,href`` rows.

    The posts come from ``pydelicious.get_userposts(user)``.  A post whose
    fields cannot be converted with ``str()`` is skipped as a whole, so the
    file never contains a partially written row.

    Args:
        user: Account name whose posts are fetched.
        output_file: Path of the CSV file to append to.

    Returns:
        None.
    """
    # One second pause before the request, presumably to stay polite to the
    # remote service.
    time.sleep(1)
    posts = pydelicious.get_userposts(user)
    skipped = 0
    with open(output_file, "a") as out:
        print('Writing adj links to file for user %s' % user)
        for item in posts:
            try:
                # Convert every field before writing anything: the old code
                # wrote field by field and left a truncated row plus a stray
                # '\r' behind when a later field failed.
                record = ','.join(
                    (str(item.user), str(item.tags), str(item.href)))
            except Exception:
                # Best effort: drop the unusable post, but do not swallow
                # KeyboardInterrupt/SystemExit the way a bare except did.
                skipped += 1
                continue
            out.write(record + '\n')
    if skipped:
        print('Skipped %d unwritable posts for user %s' % (skipped, user))
# Function to query recent links with a given tag
def get_tag_posts(tag='weigend', output_file='common_tag.csv'):
    """Append the posts carrying *tag* to *output_file* as CSV rows.

    The posts come from ``pydelicious.get_tagposts(tag)``.  Each post becomes
    one line of the form ``user,tags,href,dt``.  A post whose fields cannot be
    converted with ``str()`` is skipped as a whole, so the file never contains
    a partially written row.

    Args:
        tag: Tag to query.
        output_file: Path of the CSV file to append to.

    Returns:
        None.
    """
    # Two second pause before the request, presumably to stay polite to the
    # remote service.
    time.sleep(2)
    posts = pydelicious.get_tagposts(tag)
    skipped = 0
    with open(output_file, "a") as out:
        print('Writing common of tag %s to file' % tag)
        for item in posts:
            try:
                # Convert every field before writing anything: the old code
                # wrote field by field and left a truncated row plus a stray
                # '\r' behind when a later field failed, which shifted the
                # columns of the following row for comma-splitting readers.
                record = ','.join((str(item.user), str(item.tags),
                                   str(item.href), str(item.dt)))
            except Exception:
                # Best effort: drop the unusable post, but do not swallow
                # KeyboardInterrupt/SystemExit the way a bare except did.
                skipped += 1
                continue
            out.write(record + '\n')
    if skipped:
        print('Skipped %d unwritable posts for tag %s' % (skipped, tag))
def get_tag_popular(tag='weigend', output_file='popular_tag.csv'):
    """Fetch the popular posts for *tag* and print the raw result.

    *output_file* is opened in append mode and closed again, but nothing is
    written to it: the per-post CSV export (user, tags, href, dt) is disabled
    and the function only prints what ``pydelicious.get_popular`` returned.

    Args:
        tag: Tag to query.
        output_file: Path of the file that is opened (and left unchanged).

    Returns:
        None.
    """
    popular = pydelicious.get_popular(tag)
    sink = open(output_file, "a")
    print('Writing common of tag %s to file' % tag)
    print(popular)
    # Disabled export: one "user,tags,href,dt" row per post would go here.
    sink.close()
# This function gets the top five tags for a given link
# by looking at the linkdata.csv file for the link, which
# must already exist
def get_common_tag_list(link_data_file='linkdata.csv', top_n=5):
    """Return the most frequently used tags found in *link_data_file*.

    Every line of the file is split on commas and its second field is read
    as a whitespace-separated tag list.  Tags of one or two characters are
    ignored.  The remaining tags are ranked by how often they occur, most
    frequent first, with ties broken alphabetically.

    Args:
        link_data_file: Path of the CSV file to read; it must already exist.
        top_n: Maximum number of tags to return (five by default).

    Returns:
        A list of at most *top_n* distinct tags.  It is shorter when the file
        holds fewer distinct tags, and empty when it holds none.
    """
    counts = {}
    with open(link_data_file, 'r') as handle:
        for line in handle:
            fields = line.split(',')
            if len(fields) < 2:
                # Blank or malformed row: skip it instead of abandoning the
                # rest of the file.
                continue
            for tag in fields[1].split():
                if len(tag) > 2:
                    counts[tag] = counts.get(tag, 0) + 1
    # Rank on (-count, tag).  The old selection loop zeroed every entry that
    # shared the winning count, which silently discarded tied tags, returned
    # repeated tags when few were available, and took six tags, not five.
    ranked = sorted(counts, key=lambda tag: (-counts[tag], tag))
    return ranked[:max(top_n, 0)]
# This function takes a list of tags and, for each tag, appends the posts
# carrying that tag to the provided file (the candidate URLs to rank later)
def get_relevant_urls(tag_list, output_file='good_urls.csv'):
    """Call ``get_tag_posts`` once per tag in *tag_list*, targeting *output_file*.

    Args:
        tag_list: Iterable of tags to query, processed in order.
        output_file: Path handed to every ``get_tag_posts`` call.

    Returns:
        None.
    """
    for current_tag in tag_list:
        get_tag_posts(current_tag, output_file)
#Find the top 25 urls and return them as a list
def weed_through_urls(tag_list, input_file='good_urls.csv', limit=25,
                      exclude='weigend.com'):
    """Return the URLs from *input_file* that match the most tags in *tag_list*.

    Every line of the file is split on commas; the second field is read as a
    space-separated tag list and the third field as the URL.  A URL's score
    is the number of entries of *tag_list* found among the tags on its line;
    when a URL occurs on several lines its best score counts.  URLs are
    returned best score first, and URLs with equal scores keep the order in
    which they reached that score in the file.

    Args:
        tag_list: Tags to match against; any length is accepted.
        input_file: Path of the CSV file to read.
        limit: Maximum number of URLs to return.
        exclude: URLs containing this substring are left out; pass an empty
            string or None to keep every URL.

    Returns:
        A list of at most *limit* distinct URLs.  It is shorter when the file
        holds fewer eligible URLs, and empty when it holds none.
    """
    best = {}  # href -> (best score, line index where that score was reached)
    with open(input_file, 'r') as handle:
        for position, line in enumerate(handle):
            fields = line.rstrip('\r\n').split(',')
            if len(fields) < 3:
                # Blank or malformed row: skip it instead of abandoning the
                # rest of the file.
                continue
            href = fields[2]
            if not href or (exclude and exclude in href):
                continue
            line_tags = set(fields[1].split(' '))
            score = sum(1 for tag in tag_list if tag in line_tags)
            if href not in best or score > best[href][0]:
                best[href] = (score, position)
    # One ranking pass replaces the old descending-threshold loop, which
    # re-appended already selected URLs on every pass and never terminated
    # when the file contained no eligible URL.
    ranked = sorted(best, key=lambda href: (-best[href][0], best[href][1]))
    return ranked[:max(limit, 0)]
# For each of the users who posted the original url it grabs their
# most recent 20 links and dumps them into a file
def dump_url_adjacent_posts(link_file='linkdata.csv', out_file='adj_links.csv'):
    """Call ``write_user_urls`` for every distinct user listed in *link_file*.

    The user name is the first comma-separated field of each line.  Users
    are processed in the order of their first appearance, and each user is
    processed only once even when listed on several lines.

    Args:
        link_file: Path of the CSV file whose first column holds user names.
        out_file: Path handed to every ``write_user_urls`` call.

    Returns:
        None.
    """
    users = []
    seen = set()
    # Read the user names first so the input file is closed before the
    # per-user calls start.
    with open(link_file, 'r') as handle:
        for line in handle:
            if len(line) < 3:
                # Blank or truncated row: skip it instead of dropping every
                # user that follows.
                continue
            user = line.split(',')[0].rstrip()
            if not user or user in seen:
                # Repeated users would only append the same posts again.
                continue
            seen.add(user)
            users.append(user)
    for user in users:
        write_user_urls(user, out_file)
if __name__ == '__main__':
    # Step 1: store the posts for the seed URL in linkdata.csv.
    dump_urlposts('http://www.weigend.com/', 'linkdata.csv')
    # Optional step, currently disabled: dump_url_adjacent_posts()
    # Step 2: store the posts for the default tag in the default file.
    get_tag_posts()
    # Optional step, currently disabled: get_tag_popular()
    # Step 3: derive the common tags of the seed URL, collect the posts for
    # each of them, then rank the collected URLs.
    top_tags = get_common_tag_list('linkdata.csv')
    get_relevant_urls(top_tags, 'good_urls.csv')
    recommended = weed_through_urls(top_tags, 'good_urls.csv')
    # Step 4: print one recommended URL per line.
    for url in recommended:
        print(url)