mirror of
https://github.com/unitedstates/congress-legislators.git
synced 2025-12-19 18:05:51 -05:00
added resolvetw() to social_media.py
This commit is contained in:
@@ -1,129 +0,0 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
# This script makes the following adjustment to legislators-social-media.yaml:
|
||||
# 1. For every entry that has a `social.twitter` (i.e. `screen_name`) entry but
|
||||
# NOT a `social.twitter_id`, a `social.twitter_id` key is created with the value
|
||||
# set to the user `id` that corresponds to the user's `screen_name`, as returned by
|
||||
# # https://dev.twitter.com/rest/reference/get/users/lookup
|
||||
#
|
||||
# 2. In "prune" mode, every `twitter` value is looked up on Twitter's API. If Twitter does not
|
||||
# return a corresponding user profile, the twitter value is deleted from the data file.
|
||||
#
|
||||
# options:
|
||||
# --prune: Go into "prune" mode and delete all `twitter` entries for which Twitter API has no
|
||||
# profile information.
|
||||
#
|
||||
# --creds: points to a JSON file that contains Twitter credentials:
|
||||
# {
|
||||
# "consumer_secret": "xyz",
|
||||
# "access_token": "abc",
|
||||
# "access_token_secret": "def",
|
||||
# "consumer_key": "jk"
|
||||
# }
|
||||
import json
|
||||
import utils
|
||||
from copy import deepcopy
|
||||
from social_utils import get_twitter_api, fetch_twitter_profiles_for_screen_names
|
||||
from os.path import expanduser
|
||||
|
||||
def run():
    """
    Entry point: find Twitter screen names missing a `twitter_id` in
    legislators-social-media.yaml, look them up via the Twitter API, and
    save the file if anything changed.
    """
    creds_file = utils.flags().get('creds', "")
    prune = utils.flags().get('prune', False)

    data = utils.load_data("legislators-social-media.yaml")
    # snapshot so we can tell later whether any mutation actually happened
    snapshot = deepcopy(data)

    twitter_socials = [entry['social'] for entry in data if entry['social'].get('twitter')]

    # no creds is required yet, since we're just scanning the existing data file
    missing = find_accounts_without_ids(twitter_socials)
    if not missing:
        print("Nothing to be done; all Twitter screen names have associated IDs.")
        return

    listing = ''.join("\n - " + entry['twitter'] for entry in missing)
    print("Missing Twitter IDs for:", listing)

    api = load_creds(creds_file)
    # `missing` holds references into `data`, so `data` is mutated here
    find_and_insert_missing_ids(api, socials=missing, prune_mode=prune)

    # Write to file only if changes were made
    if data != snapshot:
        utils.save_data(data, "legislators-social-media.yaml")
    else:
        print("No changes made to legislators-social-media.yaml")
|
||||
|
||||
|
||||
def find_accounts_without_ids(socials):
    """
    socials is a list of dicts: [{'twitter': 'ev', 'facebook': 'Eve'}]

    Returns: list, filtered for dicts that have `twitter` but not `twitter_id`
    """
    missing = []
    for entry in socials:
        if entry.get('twitter') and not entry.get('twitter_id'):
            missing.append(entry)
    return missing
|
||||
|
||||
def find_and_insert_missing_ids(api, socials, prune_mode = False):
    """
    Given a list of dicts, call the Twitter API to find the profile for each
    social['twitter'] screen name and insert a 'twitter_id' attribute.

    api: an authenticated Twitter API handle (see load_creds())
    socials: list of dicts, each containing a 'twitter' key (screen name)
    prune_mode: when True, delete the 'twitter' key from any entry for which
        Twitter returns no profile

    Returns: Nothing, socials is mutated
    """
    tnames = [s['twitter'] for s in socials]
    profiles = fetch_twitter_profiles_for_screen_names(api, tnames)
    # Index profiles by lowercased screen name once, instead of scanning the
    # whole profile list for every entry (screen names are matched
    # case-insensitively, as in the original linear search).
    profiles_by_name = {p['screen_name'].lower(): p for p in profiles}
    for soc in socials:
        twitter_name = soc['twitter']
        profile = profiles_by_name.get(twitter_name.lower())
        if profile:
            print("Match:\t%s\t%s" % (twitter_name, profile['id']))
            soc['twitter_id'] = profile['id']
        else:
            print("No Twitter user profile for:\t", twitter_name)
            if prune_mode:
                print("\t...removing", twitter_name)
                soc.pop('twitter')
|
||||
|
||||
def load_creds(creds_path):
    """
    Convenience method for get_twitter_api in which creds_path points to a JSON
    file containing the four Twitter OAuth credentials.

    Raises RuntimeError when creds_path is empty/missing.
    Returns: an authenticated Twitter API handle.
    """
    if not creds_path:
        raise RuntimeError("Twitter credentials required; specify path with --creds='some.json'")
    # Use a context manager so the credentials file handle is closed
    # deterministically (the previous json.load(open(...)) leaked it).
    with open(expanduser(creds_path)) as credsfile:
        creds = json.load(credsfile)
    # filter keys so stray entries in the JSON don't break get_twitter_api
    fcreds = {k: creds[k] for k in ['access_token', 'access_token_secret', 'consumer_key', 'consumer_secret']}
    return get_twitter_api(**fcreds)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
# Script entry point: resolve missing Twitter IDs in the social-media YAML.
if __name__ == '__main__':
    run()
|
||||
|
||||
|
||||
# if __name__ == '__main__':
|
||||
#
|
||||
# # since we're dealing with only ~500 account names, let's just
|
||||
# # brute force collect them up front
|
||||
# tnames = [d['social']['twitter'] for d in data if d['social'].get('twitter')]
|
||||
# # initialize twitter API
|
||||
# api = get_twitter_api(json.load(open(MY_TWIT_CREDS_PATH)))
|
||||
# print("Fetching Twitter profiles")
|
||||
# profiles = get_twitter_profiles_from_screen_names(api, tnames)
|
||||
|
||||
|
||||
# for d in data:
|
||||
# soc = d['social']
|
||||
# tname = soc.get('twitter')
|
||||
# if tname and not soc.get('twitter_id'):
|
||||
# profile = next((p for p in profiles if p['screen_name'].lower() == tname.lower()), None)
|
||||
# if profile:
|
||||
# soc['twitter_id'] = profile['id']
|
||||
# else:
|
||||
# print(tname, 'does not exist, removing from data file')
|
||||
# # remove it
|
||||
# soc.pop('twitter')
|
||||
|
||||
# with open(OUTPUT_DATA_PATH, "w") as ofile:
|
||||
# utils.save_data(data, "legislators-social-media.yaml")
|
||||
@@ -1,9 +1,9 @@
|
||||
# Helpful functions for accessing social media APIs
|
||||
# Helpful functions for accessing Twitter
|
||||
import tweepy
|
||||
TWITTER_PROFILE_BATCH_SIZE = 100
|
||||
from math import ceil
|
||||
|
||||
def get_twitter_api(access_token, access_token_secret, consumer_key, consumer_secret):
|
||||
def get_api(access_token, access_token_secret, consumer_key, consumer_secret):
|
||||
"""
|
||||
Takes care of the Twitter OAuth authentication process and
|
||||
creates an API-handler to execute commands on Twitter
|
||||
@@ -19,7 +19,7 @@ def get_twitter_api(access_token, access_token_secret, consumer_key, consumer_se
|
||||
# create an API handler
|
||||
return tweepy.API(auth)
|
||||
|
||||
def fetch_twitter_profiles_for_screen_names(api, screen_names):
|
||||
def fetch_profiles(api, screen_names = [], ids = []):
|
||||
"""
|
||||
A wrapper method around tweepy.API.lookup_users that handles the batch lookup of
|
||||
screen_names. Assuming number of screen_names < 10000, this should not typically
|
||||
@@ -31,12 +31,13 @@ def fetch_twitter_profiles_for_screen_names(api, screen_names):
|
||||
Returns: a list of dicts representing Twitter profiles
|
||||
"""
|
||||
profiles = []
|
||||
for batch_idx in range(ceil(len(screen_names) / TWITTER_PROFILE_BATCH_SIZE)):
|
||||
key, lookups = ['user_ids', ids] if ids else ['screen_names', screen_names]
|
||||
for batch_idx in range(ceil(len(lookups) / TWITTER_PROFILE_BATCH_SIZE)):
|
||||
offset = batch_idx * TWITTER_PROFILE_BATCH_SIZE
|
||||
# break screen_names list into batches of TWITTER_PROFILE_BATCH_SIZE
|
||||
batch = screen_names[offset:(offset + TWITTER_PROFILE_BATCH_SIZE)]
|
||||
# break lookups list into batches of TWITTER_PROFILE_BATCH_SIZE
|
||||
batch = lookups[offset:(offset + TWITTER_PROFILE_BATCH_SIZE)]
|
||||
try:
|
||||
for user in api.lookup_users(screen_names = batch):
|
||||
for user in api.lookup_users(**{key: batch}):
|
||||
profiles.append(user._json)
|
||||
# catch situation in which none of the names in the batch are found
|
||||
# or else Tweepy will error out
|
||||
@@ -46,4 +47,3 @@ def fetch_twitter_profiles_for_screen_names(api, screen_names):
|
||||
else: # some other error, raise the exception
|
||||
raise e
|
||||
return profiles
|
||||
|
||||
@@ -20,6 +20,11 @@
|
||||
# run with --resolveyt:
|
||||
# finds both YouTube usernames and channel IDs and updates the YAML accordingly.
|
||||
|
||||
# run with --resolvetw:
|
||||
# for entries with `twitter` but not `twitter_id`
|
||||
# resolves Twitter screen_names to Twitter IDs and updates the YAML accordingly
|
||||
|
||||
|
||||
# other options:
|
||||
# --service (required): "twitter", "youtube", "facebook", or "instagram"
|
||||
# --bioguide: limit to only one particular member
|
||||
@@ -29,7 +34,7 @@
|
||||
|
||||
# uses a CSV at data/social_media_blacklist.csv to exclude known non-individual account names
|
||||
|
||||
import csv, re
|
||||
import csv, json, re
|
||||
import utils
|
||||
from utils import load_data, save_data
|
||||
import requests
|
||||
@@ -63,6 +68,7 @@ def main():
|
||||
do_resolvefb = utils.flags().get('resolvefb', False)
|
||||
do_resolveyt = utils.flags().get('resolveyt', False)
|
||||
do_resolveig = utils.flags().get('resolveig', False)
|
||||
do_resolvetw = utils.flags().get('resolvetw', False)
|
||||
|
||||
|
||||
# default to not caching
|
||||
@@ -75,6 +81,8 @@ def main():
|
||||
service = "youtube"
|
||||
elif do_resolveig:
|
||||
service = "instagram"
|
||||
elif do_resolvetw:
|
||||
service = "twitter"
|
||||
else:
|
||||
service = utils.flags().get('service', None)
|
||||
if service not in ["twitter", "youtube", "facebook", "instagram"]:
|
||||
@@ -272,6 +280,103 @@ def main():
|
||||
|
||||
save_data(updated_media, "legislators-social-media.yaml")
|
||||
|
||||
|
||||
def resolvetw():
    """
    Does two batch lookups:

    1. All entries with `twitter_id`: Checks to see if the corresponding Twitter profile has the same screen_name
       as found in the entry's `twitter`. If not, the `twitter` value is updated.
    2. All entries with `twitter` (but not `twitter_id`): fetches the corresponding Twitter profile by screen_name and
       inserts ID. If no profile is found, the `twitter` value is deleted.

    Note: cache/twitter_client_id must be a formatted JSON dict:
        {
          "consumer_secret": "xyz",
          "access_token": "abc",
          "access_token_secret": "def",
          "consumer_key": "jk"
        }
    """
    import rtyaml
    from social.twitter import get_api, fetch_profiles
    updated_media = rtyaml.RtYamlList()
    # carry over the YAML file's leading comment block, if any
    if hasattr(media, '__initial_comment_block'):
        updated_media.__initial_comment_block = getattr(media, '__initial_comment_block')

    # Read credentials inside a `with` block so the file handle is closed
    # (previously the handle from open() was never closed).
    with open('cache/twitter_client_id', 'r') as client_id_file:
        _c = json.load(client_id_file)
    api = get_api(_c['access_token'], _c['access_token_secret'], _c['consumer_key'], _c['consumer_secret'])
    bioguide = utils.flags().get('bioguide', None)
    lookups = {'screen_names': [], 'ids': []} # store members that have `twitter` or `twitter_id` info
    for m in media:
        # we start with appending to updated_media so that we keep the same order of entries
        # as found in the loaded file
        updated_media.append(m)
        if bioguide and (m['id']['bioguide'] != bioguide):
            continue
        social = m['social']
        # now we add entries to either the `ids` or the `screen_names` list to batch lookup
        if 'twitter_id' in social:
            # add to the queue to be batched-looked-up
            lookups['ids'].append(m)
        elif 'twitter' in social:
            lookups['screen_names'].append(m)

    #######################################
    # perform Twitter batch lookup for ids:
    if lookups['screen_names']:
        arr = lookups['screen_names']
        print("Looking up Twitter ids for", len(arr), "names.")
        tw_names = [m['social']['twitter'] for m in arr]
        tw_profiles = fetch_profiles(api, screen_names = tw_names)
        for m in arr:
            social = m['social']
            # find profile that corresponds to a given screen_name
            twitter_handle = social['twitter']
            twp = next((p for p in tw_profiles if p['screen_name'].lower() == twitter_handle.lower()), None)
            if twp:
                m['social']['twitter_id'] = int(twp['id'])
                print("Matched twitter_id `%s` to `%s`" % (social['twitter_id'], twitter_handle))
            else:
                # Remove errant Twitter entry for now
                print("No Twitter user profile for:", twitter_handle)
                m['social'].pop('twitter')
                print("\t ! removing Twitter handle:", twitter_handle)
    ##########################################
    # perform Twitter batch lookup for names by id, to update any renamings:
    if lookups['ids']:
        arr = lookups['ids']
        print("Looking up Twitter screen_names for", len(arr), "ids.")
        tw_ids = [m['social']['twitter_id'] for m in arr]
        tw_profiles = fetch_profiles(api, ids = tw_ids)
        any_renames_needed = False
        for m in arr:
            social = m['social']
            # find profile that corresponds to a given twitter_id
            t_id = social['twitter_id']
            t_name = social.get('twitter')
            twp = next((p for p in tw_profiles if int(p['id']) == t_id), None)
            if twp:
                # Be silent if there is no change to screen name
                if t_name and (twp['screen_name'].lower() == t_name.lower()):
                    pass
                else:
                    any_renames_needed = True
                    m['social']['twitter'] = twp['screen_name']
                    print("For twitter_id `%s`, renamed `%s` to `%s`" % (t_id, t_name, m['social']['twitter']))
            else:
                # No entry found for this twitter id
                print("No Twitter user profile for %s, %s" % (t_id, t_name))
                m['social'].pop('twitter_id')
                print("\t ! removing Twitter id:", t_id)
        if not any_renames_needed:
            print("No renames needed")
    # all done with Twitter
    save_data(updated_media, "legislators-social-media.yaml")
|
||||
|
||||
|
||||
def sweep():
|
||||
to_check = []
|
||||
|
||||
@@ -430,6 +535,9 @@ def main():
|
||||
resolveyt()
|
||||
elif do_resolveig:
|
||||
resolveig()
|
||||
elif do_resolvetw:
|
||||
resolvetw()
|
||||
|
||||
else:
|
||||
sweep()
|
||||
|
||||
|
||||
Reference in New Issue
Block a user