mirror of
https://github.com/unitedstates/congress-legislators.git
synced 2025-12-19 09:50:37 -05:00
#!/usr/bin/env python

# run with --sweep (or by default):
#  given a service, looks through current members for those missing an account on that service,
#  and checks that member's official website's source code for mentions of that service.
#  A CSV of "leads" is produced for manual review.
#
# run with --update:
#  reads the CSV produced by --sweep back in and updates the YAML accordingly.
#
# run with --clean:
#  removes legislators from the social media file who are no longer current.
#
# run with --verify:
#  verifies that current usernames are still valid (tries to catch renames).
#
# run with --resolveyt:
#  finds both YouTube usernames and channel IDs and updates the YAML accordingly.
#
# run with --resolveig:
#  for entries with `instagram`, resolves Instagram usernames to Instagram IDs
#  and updates the YAML accordingly.
#
# run with --resolvetw:
#  for entries with `twitter` but not `twitter_id`,
#  resolves Twitter screen_names to Twitter IDs and updates the YAML accordingly.
#
# other options:
#  --service (required): "twitter", "youtube", "facebook", or "instagram"
#  --bioguide: limit to only one particular member
#  --email:
#      in conjunction with --sweep, send an email if there are any new leads, using
#      settings in scripts/email/config.yml (if it was created and filled out).
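#
# example invocations (illustrative; argument parsing is handled by
# utils.flags(), so the exact syntax follows that helper):
#   python social_media.py --sweep --service=twitter
#   python social_media.py --update --service=twitter
#   python social_media.py --verify --service=youtube --bioguide=B000944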

# uses a CSV at data/social_media_blacklist.csv to exclude known non-individual account names
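#
# the blacklist CSV is read with "service" and "pattern" columns (see the
# csv.DictReader call below); an illustrative row might be:
#   service,pattern
#   twitter,^HouseGOP$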

import csv, json, re
import utils
from utils import load_data, save_data
import requests
import time

def main():
  regexes = {
    "youtube": [
      "(?:https?:)?//(?:www\\.)?youtube.com/embed/?\\?(list=[^\\s\"/\\?#&']+)",
      "(?:https?:)?//(?:www\\.)?youtube.com/channel/([^\\s\"/\\?#']+)",
      "(?:https?:)?//(?:www\\.)?youtube.com/(?:subscribe_widget\\?p=)?(?:subscription_center\\?add_user=)?(?:user/)?([^\\s\"/\\?#']+)"
    ],
    "facebook": [
      "\\('facebook.com/([^']+)'\\)",
      "(?:https?:)?//(?:www\\.)?facebook.com/(?:home\\.php)?(?:business/dashboard/#/)?(?:government)?(?:#!/)?(?:#%21/)?(?:#/)?pages/[^/]+/(\\d+)",
      "(?:https?:)?//(?:www\\.)?facebook.com/(?:profile.php\\?id=)?(?:home\\.php)?(?:#!)?/?(?:people)?/?([^/\\s\"#\\?&']+)"
    ],
    "twitter": [
      "(?:https?:)?//(?:www\\.)?twitter.com/(?:intent/user\\?screen_name=)?(?:#!/)?(?:#%21/)?@?([^\\s\"'/?]+)",
      "\\.render\\(\\)\\.setUser\\('@?(.*?)'\\)\\.start\\(\\)"
    ],
    "instagram": [
      "instagram.com/(\\w{3,})"
    ]
  }
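
  # illustrative examples of URLs the patterns above are meant to capture
  # (hypothetical usernames):
  #   https://www.youtube.com/user/RepExample -> "RepExample"
  #   https://www.facebook.com/RepExample     -> "RepExample"
  #   https://twitter.com/RepExample          -> "RepExample"
  #   https://instagram.com/repexample        -> "repexample"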

  email_enabled = utils.flags().get('email', False)
  debug = utils.flags().get('debug', False)
  do_update = utils.flags().get('update', False)
  do_clean = utils.flags().get('clean', False)
  do_verify = utils.flags().get('verify', False)
  do_resolveyt = utils.flags().get('resolveyt', False)
  do_resolveig = utils.flags().get('resolveig', False)
  do_resolvetw = utils.flags().get('resolvetw', False)

  # default to not caching
  cache = utils.flags().get('cache', False)
  force = not cache
  if do_resolveyt:
    service = "youtube"
  elif do_resolveig:
    service = "instagram"
  elif do_resolvetw:
    service = "twitter"
  else:
    service = utils.flags().get('service', None)

  if service not in ["twitter", "youtube", "facebook", "instagram"]:
    print("--service must be one of twitter, youtube, facebook, or instagram")
    exit(1)

  # load in members, orient by bioguide ID
  print("Loading current legislators...")
  current = load_data("legislators-current.yaml")

  current_bioguide = { }
  for m in current:
    if "bioguide" in m["id"]:
      current_bioguide[m["id"]["bioguide"]] = m
print("Loading blacklist...")
|
|
blacklist = {
|
|
'twitter': [], 'facebook': [], 'youtube': [], 'instagram': []
|
|
}
|
|
for rec in csv.DictReader(open("data/social_media_blacklist.csv")):
|
|
blacklist[rec["service"]].append(rec["pattern"])
|
|
|
|
print("Loading whitelist...")
|
|
whitelist = {
|
|
'twitter': [], 'facebook': [], 'youtube': []
|
|
}
|
|
for rec in csv.DictReader(open("data/social_media_whitelist.csv")):
|
|
whitelist[rec["service"]].append(rec["account"].lower())
|
|
|
|

  # reorient currently known social media by ID
  print("Loading social media...")
  media = load_data("legislators-social-media.yaml")
  media_bioguide = { }
  for m in media:
    media_bioguide[m["id"]["bioguide"]] = m

  def resolveyt():
    # To avoid hitting quota limits, register for a YouTube 2.0 API key at
    # https://code.google.com/apis/youtube/dashboard
    # and put it below
    api_file = open('cache/youtube_api_key', 'r')
    api_key = api_file.read().strip()

    bioguide = utils.flags().get('bioguide', None)

    updated_media = []
    for m in media:
      if bioguide and (m['id']['bioguide'] != bioguide):
        updated_media.append(m)
        continue

      social = m['social']

      if ('youtube' in social) or ('youtube_id' in social):

        if 'youtube' not in social:
          social['youtube'] = social['youtube_id']

        ytid = social['youtube']

        profile_url = ("https://gdata.youtube.com/feeds/api/users/%s"
          "?v=2&prettyprint=true&alt=json&key=%s" % (ytid, api_key))

        try:
          print("Resolving YT info for %s" % social['youtube'])
          ytreq = requests.get(profile_url)
          # print("\tFetched with status code %i..." % ytreq.status_code)

          if ytreq.status_code == 404:
            # If the account name isn't valid, it's probably a redirect.
            try:
              # Try to scrape the real YouTube username
              print("\tScraping YouTube username")
              search_url = ("https://www.youtube.com/%s" % social['youtube'])
              csearch = requests.get(search_url).text

              u = re.search(r'<a[^>]*href="[^"]*/user/([^/"]*)"[.]*>', csearch)

              if u:
                print("\t%s maps to %s" % (social['youtube'], u.group(1)))
                social['youtube'] = u.group(1)
                profile_url = ("https://gdata.youtube.com/feeds/api/users/%s"
                  "?v=2&prettyprint=true&alt=json" % social['youtube'])

                print("\tFetching GData profile...")
                ytreq = requests.get(profile_url)
                print("\tFetched GData profile")

              else:
                raise Exception("Couldn't figure out the username format for %s" % social['youtube'])

            except Exception:
              print("\tCouldn't locate YouTube account")
              raise
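
          # the GData response shape implied by the accesses below (a
          # hypothetical sketch, not a verbatim API payload):
          #   {"entry": {"yt$channelId": {"$t": "UC..."},
          #              "yt$username":  {"$t": "RepExample"},
          #              "yt$userId":    {"$t": "abc123"}}}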
          ytobj = ytreq.json()
          social['youtube_id'] = ytobj['entry']['yt$channelId']['$t']
          print("\tResolved youtube_id to %s" % social['youtube_id'])

          # even though we have their channel ID, do they also have a username?
          if ytobj['entry']['yt$username']['$t'] != ytobj['entry']['yt$userId']['$t']:
            if social['youtube'].lower() != ytobj['entry']['yt$username']['$t'].lower():
              # YT accounts are case-insensitive. Preserve capitalization if possible.
              social['youtube'] = ytobj['entry']['yt$username']['$t']
              print("\tAdded YouTube username of %s" % social['youtube'])
          else:
            print("\tYouTube says they do not have a separate username")
            del social['youtube']
        except Exception:
          print("Unable to get YouTube Channel ID for: %s" % social['youtube'])

      updated_media.append(m)

    print("Saving social media...")
    save_data(updated_media, "legislators-social-media.yaml")

  def resolveig():
    # in order to preserve the comment block at the top of the file,
    # copy it over into a new RtYamlList instance. We do this because
    # Python list instances can't hold other random attributes.
    import rtyaml
    updated_media = rtyaml.RtYamlList()
    if hasattr(media, '__initial_comment_block'):
      updated_media.__initial_comment_block = getattr(media, '__initial_comment_block')

    client_id_file = open('cache/instagram_client_id', 'r')
    client_id = client_id_file.read().strip()

    bioguide = utils.flags().get('bioguide', None)

    for m in media:
      if bioguide and (m['id']['bioguide'] != bioguide):
        updated_media.append(m)
        continue

      social = m['social']
      # skip entries with no `instagram` handle to resolve (including entries
      # that carry only an `instagram_id`, which would otherwise KeyError below)
      if 'instagram' not in social:
        updated_media.append(m)
        continue

      instagram_handle = social['instagram']
      query_url = "https://api.instagram.com/v1/users/search?q={query}&client_id={client_id}".format(query=instagram_handle, client_id=client_id)
      instagram_user_search = requests.get(query_url).json()
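      # the search response is expected to carry a 'data' list of user dicts
      # with at least 'username' and 'id' (a shape inferred from the loop
      # below, not a documented contract):
      #   {"data": [{"username": "repexample", "id": "12345", ...}, ...]}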
      for user in instagram_user_search['data']:
        time.sleep(0.5)
        if user['username'] == instagram_handle:
          social['instagram_id'] = int(user['id'])
          print("matched instagram_id {instagram_id} to {instagram_handle}".format(instagram_id=social['instagram_id'], instagram_handle=instagram_handle))
      # append at the loop level so entries without a match are preserved
      updated_media.append(m)

    save_data(updated_media, "legislators-social-media.yaml")

  def resolvetw():
    """
    Does two batch lookups:

    1. All entries with `twitter_id`: checks whether the corresponding Twitter
       profile still has the same screen_name as the entry's `twitter` value.
       If not, the `twitter` value is updated.
    2. All entries with `twitter` (but not `twitter_id`): fetches the
       corresponding Twitter profile by screen_name and inserts its ID. If no
       profile is found, the `twitter` value is deleted.

    Note: cache/twitter_client_id must be a formatted JSON dict:
      {
        "consumer_secret": "xyz",
        "access_token": "abc",
        "access_token_secret": "def",
        "consumer_key": "jk"
      }
    """
    import rtyaml
    from social.twitter import get_api, fetch_profiles
    updated_media = rtyaml.RtYamlList()
    if hasattr(media, '__initial_comment_block'):
      updated_media.__initial_comment_block = getattr(media, '__initial_comment_block')

    client_id_file = open('cache/twitter_client_id', 'r')
    _c = json.load(client_id_file)
    api = get_api(_c['access_token'], _c['access_token_secret'], _c['consumer_key'], _c['consumer_secret'])
    bioguide = utils.flags().get('bioguide', None)
    lookups = {'screen_names': [], 'ids': []}  # store members that have `twitter` or `twitter_id` info
    for m in media:
      # we start by appending to updated_media so that we keep the same order
      # of entries as found in the loaded file
      updated_media.append(m)
      if bioguide and (m['id']['bioguide'] != bioguide):
        continue
      social = m['social']
      # now we add entries to either the `ids` or the `screen_names` list to batch lookup
      if 'twitter_id' in social:
        # add to the queue to be batch-looked-up
        lookups['ids'].append(m)
      elif 'twitter' in social:
        lookups['screen_names'].append(m)

    #######################################
    # perform Twitter batch lookup for ids:
    if lookups['screen_names']:
      arr = lookups['screen_names']
      print("Looking up Twitter ids for", len(arr), "names.")
      tw_names = [m['social']['twitter'] for m in arr]
      tw_profiles = fetch_profiles(api, screen_names=tw_names)
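      # fetch_profiles is expected to return a list of profile dicts that
      # include at least 'screen_name' and 'id' (a shape inferred from the
      # matching below, not a documented contract):
      #   [{'screen_name': 'RepExample', 'id': 12345, ...}, ...]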
      for m in arr:
        social = m['social']
        # find the profile that corresponds to a given screen_name
        twitter_handle = social['twitter']
        twp = next((p for p in tw_profiles if p['screen_name'].lower() == twitter_handle.lower()), None)
        if twp:
          social['twitter_id'] = int(twp['id'])
          print("Matched twitter_id `%s` to `%s`" % (social['twitter_id'], twitter_handle))
        else:
          # Remove errant Twitter entry for now
          print("No Twitter user profile for:", twitter_handle)
          social.pop('twitter')
          print("\t ! removing Twitter handle:", twitter_handle)

    ##########################################
    # perform Twitter batch lookup for names by id, to update any renamings:
    if lookups['ids']:
      arr = lookups['ids']
      print("Looking up Twitter screen_names for", len(arr), "ids.")
      tw_ids = [m['social']['twitter_id'] for m in arr]
      tw_profiles = fetch_profiles(api, ids=tw_ids)
      any_renames_needed = False
      for m in arr:
        social = m['social']
        # find the profile that corresponds to a given twitter_id
        t_id = social['twitter_id']
        t_name = social.get('twitter')
        twp = next((p for p in tw_profiles if int(p['id']) == t_id), None)
        if twp:
          # Be silent if there is no change to the screen name
          if t_name and (twp['screen_name'].lower() == t_name.lower()):
            pass
          else:
            any_renames_needed = True
            social['twitter'] = twp['screen_name']
            print("For twitter_id `%s`, renamed `%s` to `%s`" % (t_id, t_name, social['twitter']))
        else:
          # No entry found for this twitter id
          print("No Twitter user profile for %s, %s" % (t_id, t_name))
          social.pop('twitter_id')
          print("\t ! removing Twitter id:", t_id)
      if not any_renames_needed:
        print("No renames needed")

    # all done with Twitter
    save_data(updated_media, "legislators-social-media.yaml")

  def sweep():
    to_check = []

    bioguide = utils.flags().get('bioguide', None)
    if bioguide:
      possibles = [bioguide]
    else:
      possibles = list(current_bioguide.keys())

    for bioguide in possibles:
      if media_bioguide.get(bioguide, None) is None:
        to_check.append(bioguide)
      elif (media_bioguide[bioguide]["social"].get(service, None) is None) and \
           (media_bioguide[bioguide]["social"].get(service + "_id", None) is None):
        to_check.append(bioguide)

    utils.mkdir_p("cache/social_media")
    writer = csv.writer(open("cache/social_media/%s_candidates.csv" % service, 'w'))
    writer.writerow(["bioguide", "official_full", "website", "service", "candidate", "candidate_url"])
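    # a produced lead row might look like (hypothetical values):
    #   B000944,Sherrod Brown,https://www.brown.senate.gov,twitter,SenSherrodBrown,https://twitter.com/SenSherrodBrown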

    if len(to_check) > 0:
      rows_found = []
      for bioguide in to_check:
        candidate = candidate_for(bioguide)
        if candidate:
          url = current_bioguide[bioguide]["terms"][-1].get("url", None)
          candidate_url = "https://%s.com/%s" % (service, candidate)
          row = [bioguide, current_bioguide[bioguide]['name']['official_full'], url, service, candidate, candidate_url]
          writer.writerow(row)
          print("\tWrote: %s" % candidate)
          rows_found.append(row)

      if email_enabled and len(rows_found) > 0:
        email_body = "Social media leads found:\n\n"
        for row in rows_found:
          email_body += ("%s\n" % row)
        utils.send_email(email_body)

  def verify():
    bioguide = utils.flags().get('bioguide', None)
    if bioguide:
      to_check = [bioguide]
    else:
      to_check = list(media_bioguide.keys())

    for bioguide in to_check:
      entry = media_bioguide[bioguide]
      current_account = entry['social'].get(service, None)
      if not current_account:
        continue

      bioguide = entry['id']['bioguide']

      candidate = candidate_for(bioguide, current_account)
      if not candidate:
        # if the current account is whitelisted and none is on the page, that's okay
        if current_account.lower() in whitelist[service]:
          continue
        else:
          candidate = ""

      url = current_bioguide[bioguide]['terms'][-1].get('url')

      if current_account.lower() != candidate.lower():
        print("[%s] mismatch on %s - %s -> %s" % (bioguide, url, current_account, candidate))

  def update():
    for rec in csv.DictReader(open("cache/social_media/%s_candidates.csv" % service)):
      bioguide = rec["bioguide"]
      candidate = rec["candidate"]

      if bioguide in media_bioguide:
        media_bioguide[bioguide]['social'][service] = candidate
      else:
        new_media = {'id': {}, 'social': {}}

        new_media['id']['bioguide'] = bioguide
        thomas_id = current_bioguide[bioguide]['id'].get("thomas", None)
        govtrack_id = current_bioguide[bioguide]['id'].get("govtrack", None)
        if thomas_id:
          new_media['id']['thomas'] = thomas_id
        if govtrack_id:
          new_media['id']['govtrack'] = govtrack_id

        new_media['social'][service] = candidate
        media.append(new_media)

    print("Saving social media...")
    save_data(media, "legislators-social-media.yaml")

    # if it's a youtube update, always do the resolve
    # if service == "youtube":
    #   resolveyt()

  def clean():
    print("Loading historical legislators...")
    historical = load_data("legislators-historical.yaml")

    count = 0
    for m in historical:
      if m["id"]["bioguide"] in media_bioguide:
        media.remove(media_bioguide[m["id"]["bioguide"]])
        count += 1
    print("Removed %i out-of-office legislators from the social media file..." % count)

    print("Saving social media...")
    save_data(media, "legislators-social-media.yaml")

  def candidate_for(bioguide, current=None):
    """Find the most likely candidate account from the member's official website.

    If `current` is passed and found on the page, it is returned;
    otherwise the first non-blacklisted match is returned.
    """
    url = current_bioguide[bioguide]["terms"][-1].get("url", None)
    if not url:
      if debug:
        print("[%s] No official website, skipping" % bioguide)
      return None

    if debug:
      print("[%s] Downloading..." % bioguide)
    cache = "congress/%s.html" % bioguide
    body = utils.download(url, cache, force, {'check_redirects': True})
    if not body:
      return None

    all_matches = []
    for regex in regexes[service]:
      matches = re.findall(regex, body, re.I)
      if matches:
        all_matches.extend(matches)

    if current is not None and current in all_matches:
      return current

    for candidate in all_matches:
      passed = True
      for blacked in blacklist[service]:
        if re.search(blacked, candidate, re.I):
          passed = False

      if not passed:
        if debug:
          print("\tBlacklisted: %s" % candidate)
        continue

      return candidate

    return None

  if do_update:
    update()
  elif do_clean:
    clean()
  elif do_verify:
    verify()
  elif do_resolveyt:
    resolveyt()
  elif do_resolveig:
    resolveig()
  elif do_resolvetw:
    resolvetw()
  else:
    sweep()

if __name__ == '__main__':
  main()