Skip to content
Snippets Groups Projects
Commit 6e9dcf56 authored by Jess Ward's avatar Jess Ward
Browse files

Analyzing twitter API data

Assign sentiment scores to words used in tweets.
Track the frequency of hashtags and determine the top ten used.
Determine the happiest state by tweet sentiment and location tag.
parents
No related branches found
No related tags found
No related merge requests found
Pipeline #11 skipped
from __future__ import division
import sys
import json
def frequency(tweet_file):
    """Print each word seen in the tweet stream with its relative frequency.

    tweet_file: an iterable of JSON-encoded tweet lines (e.g. an open file).
    Prints "word frequency" per distinct word; prints nothing for empty input.
    """
    word_counts = {}
    word_total = 0
    for line in tweet_file:
        tweet = json.loads(line)
        try:
            tweet_words = tweet['text'].split()
        except KeyError:
            # A line without a text attribute is not a tweet - skip it
            continue
        for word in tweet_words:
            word_total += 1
            word_counts[word] = word_counts.get(word, 0) + 1
    if word_total == 0:
        # No words at all: avoid a ZeroDivisionError below.
        return
    for word, count in word_counts.items():
        # Renamed from `frequency`, which shadowed the function itself.
        freq = count / word_total
        try:
            print("{word} {freq:10.9f}".format(word=word, freq=freq))
        except UnicodeEncodeError:
            # The output encoding cannot represent this word; skip it.
            continue
if __name__ == '__main__':
    # Entry point: argv[1] names the tweet file. Use a context manager so
    # the file handle is closed instead of leaked.
    with open(sys.argv[1]) as tweet_file:
        frequency(tweet_file)
from __future__ import division
import sys
import json
# USPS two-letter code -> full name for US states and territories.
# Includes DC and territories plus the non-standard 'NA'/'National' entry.
states = {
    'AK': 'Alaska',
    'AL': 'Alabama',
    'AR': 'Arkansas',
    'AS': 'American Samoa',
    'AZ': 'Arizona',
    'CA': 'California',
    'CO': 'Colorado',
    'CT': 'Connecticut',
    'DC': 'District of Columbia',
    'DE': 'Delaware',
    'FL': 'Florida',
    'GA': 'Georgia',
    'GU': 'Guam',
    'HI': 'Hawaii',
    'IA': 'Iowa',
    'ID': 'Idaho',
    'IL': 'Illinois',
    'IN': 'Indiana',
    'KS': 'Kansas',
    'KY': 'Kentucky',
    'LA': 'Louisiana',
    'MA': 'Massachusetts',
    'MD': 'Maryland',
    'ME': 'Maine',
    'MI': 'Michigan',
    'MN': 'Minnesota',
    'MO': 'Missouri',
    'MP': 'Northern Mariana Islands',
    'MS': 'Mississippi',
    'MT': 'Montana',
    'NA': 'National',
    'NC': 'North Carolina',
    'ND': 'North Dakota',
    'NE': 'Nebraska',
    'NH': 'New Hampshire',
    'NJ': 'New Jersey',
    'NM': 'New Mexico',
    'NV': 'Nevada',
    'NY': 'New York',
    'OH': 'Ohio',
    'OK': 'Oklahoma',
    'OR': 'Oregon',
    'PA': 'Pennsylvania',
    'PR': 'Puerto Rico',
    'RI': 'Rhode Island',
    'SC': 'South Carolina',
    'SD': 'South Dakota',
    'TN': 'Tennessee',
    'TX': 'Texas',
    'UT': 'Utah',
    'VA': 'Virginia',
    'VI': 'Virgin Islands',
    'VT': 'Vermont',
    'WA': 'Washington',
    'WI': 'Wisconsin',
    'WV': 'West Virginia',
    'WY': 'Wyoming'
}
def parse_AFINN(afinnfile):
    """Build a term -> sentiment score mapping from an AFINN lexicon file.

    Each line holds a term and an integer score separated by a tab.
    """
    entries = (line.split("\t") for line in afinnfile)
    return {term: int(score) for term, score in entries}
def lookup_state(location_text):
    """Map a free-form location string to a two-letter state code, or None.

    Matches either the state's full name anywhere in the text, or its USPS
    abbreviation as a standalone word. The original substring test on the
    bare two-letter codes produced false positives ("Nashville" contains
    "na" -> 'NA') and missed spelled-out names ("New York" contains no
    contiguous "ny"). Iterates codes in sorted order so ties resolve
    deterministically.
    """
    if not location_text:
        return None
    lowered = location_text.lower()
    # Split on non-letters so "Austin, TX" yields the standalone token "tx".
    tokens = set(''.join(c if c.isalpha() else ' ' for c in lowered).split())
    for abbrev in sorted(states):
        if abbrev.lower() in tokens or states[abbrev].lower() in lowered:
            return abbrev
    return None
def score_tweet(tweet_text, scores):
    """Return the total sentiment of a tweet: the sum of the AFINN scores
    of its whitespace-separated words. Unknown words contribute zero."""
    return sum(scores.get(token, 0) for token in tweet_text.split())
def main():
    """Print the state code with the highest average tweet sentiment.

    argv[1]: AFINN sentiment lexicon file.
    argv[2]: file of JSON-encoded tweets, one per line.
    Prints nothing when no tweet can be mapped to a state.
    """
    # Context managers close the files instead of leaking the handles.
    with open(sys.argv[1]) as sent_file:
        scores = parse_AFINN(sent_file)
    state_scores = {}
    with open(sys.argv[2]) as tweet_file:
        for line in tweet_file:
            tweet = json.loads(line)
            try:
                tweet_location = tweet['user']['location']
            except (KeyError, TypeError):
                # Ignore lines with no user/location; TypeError covers
                # records where 'user' is present but null.
                continue
            state = lookup_state(tweet_location)
            if not state:
                continue
            try:
                tweet_text = tweet['text']
            except KeyError:
                # Ignore any lines with no tweet text
                continue
            tweet_score = score_tweet(tweet_text, scores)
            # Accumulate sentiment sum and tweet count for this state.
            totals = state_scores.setdefault(state, {'sum': 0, 'count': 0})
            totals['sum'] += tweet_score
            totals['count'] += 1
    if not state_scores:
        # max() on an empty dict would raise ValueError.
        return
    # count >= 1 for every recorded state, so the division is safe.
    averages = {st: s['sum'] / s['count'] for st, s in state_scores.items()}
    print(max(averages, key=averages.get))
# Entry point guard: run only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
This diff is collapsed.
import sys
import json
def parse_AFINN(afinnfile):
    """Read the AFINN lexicon; each line is "term<TAB>score" with an
    integer score. Returns a term -> score dictionary."""
    return {term: int(value)
            for term, value in (line.split("\t") for line in afinnfile)}
def score_tweets(tweet_file, scores):
    """Derive sentiment scores for words missing from the AFINN lexicon.

    For each unknown word in each tweet, average the scores of the known
    words within three positions on either side, and print the word with
    that derived score. Unknown words with no scored neighbours are not
    printed.
    """
    for line in tweet_file:
        tweet = json.loads(line)
        try:
            tweet_words = tweet['text'].split()
        except KeyError:
            # A line without text attribute is not a tweet - skip it
            continue
        for index, word in enumerate(tweet_words):
            if word in scores:
                continue
            neighbour_sum = 0
            neighbour_count = 0
            for offset in (-3, -2, -1, 1, 2, 3):
                neighbour = index + offset
                if neighbour < 0:
                    # A negative index would wrap around to the end of the
                    # tweet and pull in unrelated context words.
                    continue
                try:
                    neighbour_sum += scores[tweet_words[neighbour]]
                    neighbour_count += 1
                except (KeyError, IndexError):
                    pass
            if neighbour_count > 0:
                # float() forces true division: this script has no
                # "from __future__ import division", so "/" would
                # truncate under Python 2.
                derived = neighbour_sum / float(neighbour_count)
                print("{0} {1}".format(word, derived))
def main():
    """Entry point: argv[1] = AFINN lexicon file, argv[2] = tweet file.

    Uses context managers so both file handles are closed, not leaked.
    """
    with open(sys.argv[1]) as sent_file:
        scores = parse_AFINN(sent_file)
    with open(sys.argv[2]) as tweet_file:
        score_tweets(tweet_file, scores)
# Entry point guard: run only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
from __future__ import division
import sys
import json
def hashtag_count(tweet_file):
    """Count occurrences of each hashtag across a file of JSON tweets.

    tweet_file: iterable of JSON-encoded lines.
    Returns a dict mapping hashtag text -> occurrence count.
    """
    counts = {}
    for raw_line in tweet_file:
        record = json.loads(raw_line)
        try:
            tags = record['entities']['hashtags']
        except KeyError:
            # Lines without an entities section are not tweets - skip them.
            continue
        for tag in tags:
            text = tag['text']
            counts[text] = counts.get(text, 0) + 1
    return counts
def main(tweet_file):
    """Print the ten most frequent hashtags in the tweet file, one
    "hashtag count" pair per line, most frequent first."""
    counts = hashtag_count(tweet_file)
    ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)
    for hashtag, count in ranked[:10]:
        print("{0} {1}".format(hashtag, count))
if __name__ == '__main__':
    # Entry point: argv[1] names the tweet file. Use a context manager so
    # the file handle is closed instead of leaked.
    with open(sys.argv[1]) as tweet_file:
        main(tweet_file)
import sys
import json
def parse_AFINN(afinnfile):
    """Build the term -> integer sentiment score table from an AFINN
    lexicon file (one tab-separated "term<TAB>score" per line)."""
    table = {}
    for entry in afinnfile:
        word, value = entry.split("\t")
        table[word] = int(value)
    return table
def score_tweets(tweet_file, scores):
    """Print one sentiment total per tweet: the sum of the AFINN scores of
    the tweet's words. Words not in the lexicon contribute zero.

    Removes the unused `tweets = {}` local and replaces the Python 2
    print statement with the print() form used by the rest of the file.
    """
    for line in tweet_file:
        tweet = json.loads(line)
        try:
            tweet_words = tweet['text'].split()
        except KeyError:
            # A line without text attribute is not a tweet - skip it
            continue
        tweet_total = sum(scores.get(word, 0) for word in tweet_words)
        print(tweet_total)
def main():
    """Entry point: argv[1] = AFINN lexicon file, argv[2] = tweet file.

    Uses context managers so both file handles are closed, not leaked.
    """
    with open(sys.argv[1]) as sent_file:
        scores = parse_AFINN(sent_file)
    with open(sys.argv[2]) as tweet_file:
        score_tweets(tweet_file, scores)
# Entry point guard: run only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment