# Bill Whiteley
# Python Recommender System
# Bill Whiteley, Stats 252 Homework 5

# Imports
import sys
import time
import os
import stat
import csv
import pydelicious
import datetime


def _without_www(link):
    """Return ``link`` with a leading ``www.`` removed from its host part.

    Works with or without a ``scheme://`` prefix; the match on ``www.`` is
    case-insensitive. A link that has no such prefix is returned unchanged.
    """
    scheme, sep, rest = link.partition('://')
    if not sep:
        # No scheme separator: partition() put the whole link in `scheme`.
        scheme, rest = '', link
    if rest[:4].lower() == 'www.':
        rest = rest[4:]
    return scheme + sep + rest


# This function gets all of the url posts for a given url and stores them
# in a file
def dump_urlposts(link='http://www.weigend.com/', output_file='linkdata.csv'):
    """Fetch the posts for ``link`` and store them in ``output_file``.

    The URL is queried as given and then a second time with its leading
    ``www.`` removed. The first result overwrites ``output_file``; the
    second is appended. Every post becomes one ``user,tags,dt`` line.

    The previous code built the second URL with ``rstrip('www.WWW.')``,
    which only strips trailing characters and so left the link untouched;
    the same URL was fetched twice and every row was duplicated. When the
    link has no ``www.`` prefix the second query is now skipped for the
    same reason.

    Parameters:
        link: URL whose posts are fetched.
        output_file: path of the file to (re)write.
    """
    queries = [(link, 'Writing url www posts to file %s', "w")]
    bare_link = _without_www(link)
    if bare_link != link:
        queries.append(
            (bare_link, 'Writing url (-www) posts to file %s', "a"))

    for url, message, mode in queries:
        # Pause before each request -- presumably to throttle calls to the
        # service (TODO confirm the required delay).
        time.sleep(1)
        data = pydelicious.get_urlposts(url)
        # Fetch before opening so a failed request does not truncate an
        # existing file.
        with open(output_file, mode) as out:
            print(message % output_file)
            for item in data:
                out.write(item.user)
                out.write(",")
                out.write(item.tags)
                out.write(",")
                out.write(item.dt)
                out.write('\n')
   
def write_user_urls(user='whitewi4', output_file='adj_links.csv'):
    """Append the posts of ``user`` to ``output_file``.

    Each post is written as one ``user,tags,href`` line. A row is assembled
    completely before it is written, so a post that cannot be converted
    (a missing attribute, or text that ``str()`` cannot encode) is skipped
    as a whole. The previous code wrote the fields one by one and emitted a
    bare ``'\\r'`` on any error, which left a partial row glued to the
    following record.

    Parameters:
        user: account name whose posts are fetched.
        output_file: path of the file to append to.
    """
    # Pause before the request -- presumably to throttle calls to the
    # service (TODO confirm the required delay).
    time.sleep(1)
    data = pydelicious.get_userposts(user)
    with open(output_file, "a") as out:
        print('Writing adj links to file for user %s' % user)

        for item in data:
            try:
                row = ",".join(
                    [str(item.user), str(item.tags), str(item.href)])
            except (AttributeError, UnicodeError):
                # Best effort: drop the unusable post, keep the file clean.
                continue
            out.write(row + '\n')
   
# Function to query recent links with a given tag
def get_tag_posts(tag='weigend', output_file='common_tag.csv'):
    """Append the posts carrying ``tag`` to ``output_file``.

    Each post is written as one ``user,tags,href,dt`` line. A row is
    assembled completely before it is written, so a post that cannot be
    converted (a missing attribute, or text that ``str()`` cannot encode)
    is skipped as a whole. The previous code wrote the fields one by one
    and emitted a bare ``'\\r'`` on any error, which left a partial row
    glued to the following record and shifted its columns for readers that
    split on commas.

    Parameters:
        tag: tag to query.
        output_file: path of the file to append to.
    """
    # Pause before the request -- presumably to throttle calls to the
    # service (TODO confirm the required delay).
    time.sleep(2)
    data = pydelicious.get_tagposts(tag)
    with open(output_file, "a") as out:
        print('Writing common of tag %s to file' % tag)

        for item in data:
            try:
                row = ",".join([
                    str(item.user),
                    str(item.tags),
                    str(item.href),
                    str(item.dt),
                ])
            except (AttributeError, UnicodeError):
                # Best effort: drop the unusable post, keep the file clean.
                continue
            out.write(row + '\n')



def get_tag_popular(tag='weigend', output_file='popular_tag.csv'):
    """Fetch the popular posts for ``tag`` and print the raw result.

    Nothing is written to ``output_file`` yet: the file is only opened in
    append mode (which creates it when missing and leaves existing content
    alone) and closed again. The row-writing loop that used to sit here
    inside a string literal was never executed and has been removed.

    Parameters:
        tag: tag to query.
        output_file: path of the file that is opened (and created if
            necessary) but not modified.
    """
    data = pydelicious.get_popular(tag)
    # `with` guarantees the handle is released even if printing fails.
    with open(output_file, "a"):
        print('Writing common of tag %s to file' % tag)
        # NOTE(review): the layout of `data` is not established anywhere in
        # this file; it is dumped as-is so it can be inspected before a
        # writer is added.
        print(data)

# This function gets the most frequent tags for a given link
# by looking at the linkdata.csv file for the link, which
# must already exist
def get_common_tag_list(link_data_file='linkdata.csv', count=6):
    """Return the most frequent tags found in ``link_data_file``.

    Every line is expected to look like ``user,tags,...`` where ``tags`` is
    a whitespace-separated list. Tags of one or two characters are ignored.
    Tags are ranked by how often they occur, most frequent first; tags with
    the same frequency are ordered alphabetically.

    Differences from the previous implementation:
      * tags tied with an already chosen tag are no longer thrown away;
      * the result holds distinct tags only, so it may be shorter than
        ``count`` instead of being padded with repeats of the first tag;
      * an empty file yields ``[]`` instead of raising ``ValueError``;
      * blank lines and lines without a comma are skipped.

    Parameters:
        link_data_file: path of the file to read.
        count: maximum number of tags to return. The default is six
            because that is how many entries the old loop produced, even
            though its comment said "top five".

    Returns:
        A list of at most ``count`` tag strings.
    """
    with open(link_data_file, 'r') as link_file:
        lines = link_file.readlines()

    tag_counts = {}
    for line in lines:
        fields = line.split(',')
        if len(line) < 3 or len(fields) < 2:
            continue
        for tag in fields[1].split():
            if len(tag) > 2:
                tag_counts[tag] = tag_counts.get(tag, 0) + 1

    # Highest count first; the tag itself breaks ties so the order is
    # deterministic.
    ranked = sorted(tag_counts, key=lambda tag: (-tag_counts[tag], tag))
    return ranked[:max(count, 0)]
   
# Gathers candidate posts for a list of tags into the provided file
def get_relevant_urls(tag_list, output_file='good_urls.csv'):
    """Collect the posts of every tag in ``tag_list`` into ``output_file``.

    Calls get_tag_posts once per tag, in list order, always with the same
    output file. No ranking or filtering is done here.

    Parameters:
        tag_list: iterable of tags to query.
        output_file: path handed to get_tag_posts for each tag.
    """
    for current_tag in tag_list:
        get_tag_posts(current_tag, output_file)
   
#Find the top 25 urls and return them as a list
def weed_through_urls(tag_list, input_file='good_urls.csv', limit=25,
                      exclude='weigend.com'):
    """Return the URLs in ``input_file`` that best match ``tag_list``.

    Every line is expected to look like ``user,tags,href,...`` where
    ``tags`` is a space-separated list. A line's score is the number of the
    first five entries of ``tag_list`` that occur among its tags. URLs are
    returned best score first; lines with equal scores keep their file
    order. Lines with fewer than three fields are skipped.

    Differences from the previous implementation:
      * each URL is returned once (the old threshold loop re-added URLs on
        every pass);
      * the function always terminates and returns fewer than ``limit``
        URLs when the file does not hold enough (the old loop never ended
        if no line qualified);
      * ``tag_list`` may hold fewer than five tags.

    Parameters:
        tag_list: tags to match; only the first five are scored, as before.
        input_file: path of the file to read.
        limit: maximum number of URLs to return.
        exclude: URLs containing this substring are left out; pass an
            empty string to disable the filter.

    Returns:
        A list of at most ``limit`` distinct URL strings.
    """
    with open(input_file, 'r') as url_file:
        lines = url_file.readlines()

    wanted = tag_list[:5]

    scored = []
    for line in lines:
        fields = line.split(',')
        if len(fields) < 3:
            continue
        href = fields[2].strip()
        if exclude and exclude in href:
            continue
        post_tags = fields[1].split(' ')
        score = sum(1 for tag in wanted if tag in post_tags)
        scored.append((score, href))

    # list.sort is stable, so equally scored lines stay in file order.
    scored.sort(key=lambda pair: -pair[0])

    top_urls = []
    seen = set()
    for _score, href in scored:
        if len(top_urls) >= limit:
            break
        if href not in seen:
            seen.add(href)
            top_urls.append(href)
    return top_urls
   
# For each of the users who posted the original url it grabs their
# posts and dumps them into a file
def dump_url_adjacent_posts(link_file='linkdata.csv', out_file='adj_links.csv'):
    """Write the posts of every user listed in ``link_file`` to ``out_file``.

    The user name is the text before the first comma of each line. Each
    distinct user is handed to write_user_urls exactly once, in order of
    first appearance; the previous code repeated the call for every line,
    so a user listed twice was fetched and written twice. Blank or
    too-short lines and empty user names are skipped.

    NOTE(review): the old comment said the "most recent 20 links" are
    fetched per user; how many posts come back is decided inside
    write_user_urls and is not visible here.

    Parameters:
        link_file: path of the file listing the users.
        out_file: path passed on to write_user_urls.
    """
    # Read everything up front so the input file is closed before the
    # (slow, failure-prone) per-user requests start.
    with open(link_file, 'r') as links:
        lines = links.readlines()

    seen_users = set()
    for line in lines:
        if len(line) < 3:
            continue
        user = line.split(",")[0].rstrip()
        if not user or user in seen_users:
            continue
        seen_users.add(user)
        write_user_urls(user, out_file)
     

if __name__ == '__main__':

    # Step 1: store the posts for the seed url in linkdata.csv
    dump_urlposts('http://www.weigend.com/', 'linkdata.csv')

    # Disabled step: collect the other links of the users found above
    #dump_url_adjacent_posts()

    # Step 2: store the posts for the default tag (default output file)
    get_tag_posts()

    # Disabled step: popular posts for the default tag
    #get_tag_popular()

    # Step 3: derive the most common tags of the seed url, gather the posts
    # for each of them and rank the resulting urls
    common_tags = get_common_tag_list('linkdata.csv')
    get_relevant_urls(common_tags, 'good_urls.csv')
    recommended = weed_through_urls(common_tags, 'good_urls.csv')

    # Step 4: show the recommendations, one url per line
    for url in recommended:
        print(url)