#!/usr/bin/env python # run with --sweep (or by default): # given a service, looks through current members for those missing an account on that service, # and checks that member's official website's source code for mentions of that service. # A CSV of "leads" is produced for manual review. # # run with --update: # reads the CSV produced by --sweep back in and updates the YAML accordingly. # # run with --clean: # removes legislators from the social media file who are no longer current # # run with --verify: # verifies that current usernames are still valid. (tries to catch renames) # # run with --resolveyt: # finds both YouTube usernames and channel IDs and updates the YAML accordingly. # run with --resolvetw: # for entries with `twitter` but not `twitter_id` # resolves Twitter screen_names to Twitter IDs and updates the YAML accordingly # other options: # --service (required): "twitter", "youtube", "facebook", or "instagram" # --bioguide: limit to only one particular member # --email: # in conjunction with --sweep, send an email if there are any new leads, using # settings in scripts/email/config.yml (if it was created and filled out). # uses a CSV at data/social_media_blacklist.csv to exclude known non-individual account names import csv, json, re import utils from utils import load_data, save_data import requests import time def main(): regexes = { "youtube": [ "(?:https?:)?//(?:www\\.)?youtube.com/embed/?\?(list=[^\\s\"/\\?#&']+)", "(?:https?:)?//(?:www\\.)?youtube.com/channel/([^\\s\"/\\?#']+)", "(?:https?:)?//(?:www\\.)?youtube.com/(?:subscribe_widget\\?p=)?(?:subscription_center\\?add_user=)?(?:user/)?([^\\s\"/\\?#']+)" ], "facebook": [ "\\('facebook.com/([^']+)'\\)", "(?:https?:)?//(?:www\\.)?facebook.com/(?:home\\.php)?(?:business/dashboard/#/)?(?:government)?(?:#!/)?(?:#%21/)?(?:#/)?pages/[^/]+/(\\d+)", "(?:https?:)?//(?:www\\.)?facebook.com/(?:profile.php\\?id=)?(?:home\\.php)?(?:#!)?/?(?:people)?/?([^/\\s\"#\\?&']+)" ], "twitter": [ "(?:https?:)?//(?:www\\.)?twitter.com/(?:intent/user\?screen_name=)?(?:#!/)?(?:#%21/)?@?([^\\s\"'/?]+)", "\\.render\\(\\)\\.setUser\\('@?(.*?)'\\)\\.start\\(\\)" ], "instagram": [ "instagram.com/(\w{3,})" ] } email_enabled = utils.flags().get('email', False) debug = utils.flags().get('debug', False) do_update = utils.flags().get('update', False) do_clean = utils.flags().get('clean', False) do_verify = utils.flags().get('verify', False) do_resolveyt = utils.flags().get('resolveyt', False) do_resolveig = utils.flags().get('resolveig', False) do_resolvetw = utils.flags().get('resolvetw', False) # default to not caching cache = utils.flags().get('cache', False) force = not cache if do_resolveyt: service = "youtube" elif do_resolveig: service = "instagram" elif do_resolvetw: service = "twitter" else: service = utils.flags().get('service', None) if service not in ["twitter", "youtube", "facebook", "instagram"]: print("--service must be one of twitter, youtube, facebook, or instagram") exit(0) # load in members, orient by bioguide ID print("Loading current legislators...") current = load_data("legislators-current.yaml") current_bioguide = { } for m in current: if "bioguide" in m["id"]: current_bioguide[m["id"]["bioguide"]] = m print("Loading blacklist...") blacklist = { 'twitter': [], 'facebook': [], 'youtube': [], 'instagram': [] } for rec in csv.DictReader(open("data/social_media_blacklist.csv")): blacklist[rec["service"]].append(rec["pattern"]) print("Loading whitelist...") whitelist = { 'twitter': [], 'facebook': [], 'youtube': [] } for rec in csv.DictReader(open("data/social_media_whitelist.csv")): whitelist[rec["service"]].append(rec["account"].lower()) # reorient currently known social media by ID print("Loading social media...") media = load_data("legislators-social-media.yaml") media_bioguide = { } for m in media: media_bioguide[m["id"]["bioguide"]] = m def resolveyt(): # To avoid hitting quota limits, register for a YouTube 2.0 API key at # https://code.google.com/apis/youtube/dashboard # and put it below api_file = open('cache/youtube_api_key','r') api_key = api_file.read() bioguide = utils.flags().get('bioguide', None) updated_media = [] for m in media: if bioguide and (m['id']['bioguide'] != bioguide): updated_media.append(m) continue social = m['social'] if ('youtube' in social) or ('youtube_id' in social): if 'youtube' not in social: social['youtube'] = social['youtube_id'] ytid = social['youtube'] profile_url = ("https://gdata.youtube.com/feeds/api/users/%s" "?v=2&prettyprint=true&alt=json&key=%s" % (ytid, api_key)) try: print("Resolving YT info for %s" % social['youtube']) ytreq = requests.get(profile_url) # print "\tFetched with status code %i..." % ytreq.status_code if ytreq.status_code == 404: # If the account name isn't valid, it's probably a redirect. try: # Try to scrape the real YouTube username print("\Scraping YouTube username") search_url = ("https://www.youtube.com/%s" % social['youtube']) csearch = requests.get(search_url).text.encode('ascii','ignore') u = re.search(r']*href="[^"]*/user/([^/"]*)"[.]*>',csearch) if u: print("\t%s maps to %s" % (social['youtube'],u.group(1))) social['youtube'] = u.group(1) profile_url = ("https://gdata.youtube.com/feeds/api/users/%s" "?v=2&prettyprint=true&alt=json" % social['youtube']) print("\tFetching GData profile...") ytreq = requests.get(profile_url) print("\tFetched GData profile") else: raise Exception("Couldn't figure out the username format for %s" % social['youtube']) except: print("\tCouldn't locate YouTube account") raise ytobj = ytreq.json() social['youtube_id'] = ytobj['entry']['yt$channelId']['$t'] print("\tResolved youtube_id to %s" % social['youtube_id']) # even though we have their channel ID, do they also have a username? if ytobj['entry']['yt$username']['$t'] != ytobj['entry']['yt$userId']['$t']: if social['youtube'].lower() != ytobj['entry']['yt$username']['$t'].lower(): # YT accounts are case-insensitive. Preserve capitalization if possible. social['youtube'] = ytobj['entry']['yt$username']['$t'] print("\tAdded YouTube username of %s" % social['youtube']) else: print("\tYouTube says they do not have a separate username") del social['youtube'] except: print("Unable to get YouTube Channel ID for: %s" % social['youtube']) updated_media.append(m) print("Saving social media...") save_data(updated_media, "legislators-social-media.yaml") def resolveig(): # in order to preserve the comment block at the top of the file, # copy it over into a new RtYamlList instance. We do this because # Python list instances can't hold other random attributes. import rtyaml updated_media = rtyaml.RtYamlList() if hasattr(media, '__initial_comment_block'): updated_media.__initial_comment_block = getattr(media, '__initial_comment_block') client_id_file = open('cache/instagram_client_id','r') client_id = client_id_file.read() bioguide = utils.flags().get('bioguide', None) for m in media: if bioguide and (m['id']['bioguide'] != bioguide): updated_media.append(m) continue social = m['social'] if 'instagram' not in social and 'instagram_id' not in social: updated_media.append(m) continue instagram_handle = social['instagram'] query_url = "https://api.instagram.com/v1/users/search?q={query}&client_id={client_id}".format(query=instagram_handle,client_id=client_id) instagram_user_search = requests.get(query_url).json() for user in instagram_user_search['data']: time.sleep(0.5) if user['username'] == instagram_handle: m['social']['instagram_id'] = int(user['id']) print("matched instagram_id {instagram_id} to {instagram_handle}".format(instagram_id=social['instagram_id'],instagram_handle=instagram_handle)) updated_media.append(m) save_data(updated_media, "legislators-social-media.yaml") def resolvetw(): """ Does two batch lookups: 1. All entries with `twitter_id`: Checks to see if the corresponding Twitter profile has the same screen_name as found in the entry's `twitter`. If not, the `twitter` value is updated. 2. All entries with `twitter` (but not `twitter_id`): fetches the corresponding Twitter profile by screen_name and inserts ID. If no profile is found, the `twitter` value is deleted. Note: cache/twitter_client_id must be a formatted JSON dict: { "consumer_secret": "xyz", "access_token": "abc", "access_token_secret": "def", "consumer_key": "jk" } """ import rtyaml from social.twitter import get_api, fetch_profiles updated_media = rtyaml.RtYamlList() if hasattr(media, '__initial_comment_block'): updated_media.__initial_comment_block = getattr(media, '__initial_comment_block') client_id_file = open('cache/twitter_client_id', 'r') _c = json.load(client_id_file) api = get_api(_c['access_token'], _c['access_token_secret'], _c['consumer_key'], _c['consumer_secret']) bioguide = utils.flags().get('bioguide', None) lookups = {'screen_names': [], 'ids': []} # store members that have `twitter` or `twitter_id` info for m in media: # we start with appending to updated_media so that we keep the same order of entries # as found in the loaded file updated_media.append(m) if bioguide and (m['id']['bioguide'] != bioguide): continue social = m['social'] # now we add entries to either the `ids` or the `screen_names` list to batch lookup if 'twitter_id' in social: # add to the queue to be batched-looked-up lookups['ids'].append(m) # append elif 'twitter' in social: lookups['screen_names'].append(m) ####################################### # perform Twitter batch lookup for ids: if lookups['screen_names']: arr = lookups['screen_names'] print("Looking up Twitter ids for", len(arr), "names.") tw_names = [m['social']['twitter'] for m in arr] tw_profiles = fetch_profiles(api, screen_names = tw_names) for m in arr: social = m['social'] # find profile that corresponds to a given screen_name twitter_handle = social['twitter'] twp = next((p for p in tw_profiles if p['screen_name'].lower() == twitter_handle.lower()), None) if twp: m['social']['twitter_id'] = int(twp['id']) print("Matched twitter_id `%s` to `%s`" % (social['twitter_id'], twitter_handle)) else: # Remove errant Twitter entry for now print("No Twitter user profile for:", twitter_handle) m['social'].pop('twitter') print("\t ! removing Twitter handle:", twitter_handle) ########################################## # perform Twitter batch lookup for names by id, to update any renamings: if lookups['ids']: arr = lookups['ids'] print("Looking up Twitter screen_names for", len(arr), "ids.") tw_ids = [m['social']['twitter_id'] for m in arr] tw_profiles = fetch_profiles(api, ids = tw_ids) any_renames_needed = False for m in arr: social = m['social'] # find profile that corresponds to a given screen_name t_id = social['twitter_id'] t_name = social.get('twitter') twp = next((p for p in tw_profiles if int(p['id']) == t_id), None) if twp: # Be silent if there is no change to screen name if t_name and (twp['screen_name'].lower() == t_name.lower()): pass else: any_renames_needed = True m['social']['twitter'] = twp['screen_name'] print("For twitter_id `%s`, renamed `%s` to `%s`" % (t_id, t_name, m['social']['twitter'])) else: # No entry found for this twitter id print("No Twitter user profile for %s, %s" % (t_id, t_name)) m['social'].pop('twitter_id') print("\t ! removing Twitter id:", t_id) if not any_renames_needed: print("No renames needed") # all done with Twitter save_data(updated_media, "legislators-social-media.yaml") def sweep(): to_check = [] bioguide = utils.flags().get('bioguide', None) if bioguide: possibles = [bioguide] else: possibles = list(current_bioguide.keys()) for bioguide in possibles: if media_bioguide.get(bioguide, None) is None: to_check.append(bioguide) elif (media_bioguide[bioguide]["social"].get(service, None) is None) and \ (media_bioguide[bioguide]["social"].get(service + "_id", None) is None): to_check.append(bioguide) else: pass utils.mkdir_p("cache/social_media") writer = csv.writer(open("cache/social_media/%s_candidates.csv" % service, 'w')) writer.writerow(["bioguide", "official_full", "website", "service", "candidate", "candidate_url"]) if len(to_check) > 0: rows_found = [] for bioguide in to_check: candidate = candidate_for(bioguide) if candidate: url = current_bioguide[bioguide]["terms"][-1].get("url", None) candidate_url = "https://%s.com/%s" % (service, candidate) row = [bioguide, current_bioguide[bioguide]['name']['official_full'].encode('utf-8'), url, service, candidate, candidate_url] writer.writerow(row) print("\tWrote: %s" % candidate) rows_found.append(row) if email_enabled and len(rows_found) > 0: email_body = "Social media leads found:\n\n" for row in rows_found: email_body += ("%s\n" % row) utils.send_email(email_body) def verify(): bioguide = utils.flags().get('bioguide', None) if bioguide: to_check = [bioguide] else: to_check = list(media_bioguide.keys()) for bioguide in to_check: entry = media_bioguide[bioguide] current = entry['social'].get(service, None) if not current: continue bioguide = entry['id']['bioguide'] candidate = candidate_for(bioguide, current) if not candidate: # if current is in whitelist, and none is on the page, that's okay if current.lower() in whitelist[service]: continue else: candidate = "" url = current_bioguide[bioguide]['terms'][-1].get('url') if current.lower() != candidate.lower(): print("[%s] mismatch on %s - %s -> %s" % (bioguide, url, current, candidate)) def update(): for rec in csv.DictReader(open("cache/social_media/%s_candidates.csv" % service)): bioguide = rec["bioguide"] candidate = rec["candidate"] if bioguide in media_bioguide: media_bioguide[bioguide]['social'][service] = candidate else: new_media = {'id': {}, 'social': {}} new_media['id']['bioguide'] = bioguide thomas_id = current_bioguide[bioguide]['id'].get("thomas", None) govtrack_id = current_bioguide[bioguide]['id'].get("govtrack", None) if thomas_id: new_media['id']['thomas'] = thomas_id if govtrack_id: new_media['id']['govtrack'] = govtrack_id new_media['social'][service] = candidate media.append(new_media) print("Saving social media...") save_data(media, "legislators-social-media.yaml") # if it's a youtube update, always do the resolve # if service == "youtube": # resolveyt() def clean(): print("Loading historical legislators...") historical = load_data("legislators-historical.yaml") count = 0 for m in historical: if m["id"]["bioguide"] in media_bioguide: media.remove(media_bioguide[m["id"]["bioguide"]]) count += 1 print("Removed %i out of office legislators from social media file..." % count) print("Saving historical legislators...") save_data(media, "legislators-social-media.yaml") def candidate_for(bioguide, current = None): """find the most likely candidate account from the URL. If current is passed, the candidate will match it if found otherwise, the first candidate match is returned """ url = current_bioguide[bioguide]["terms"][-1].get("url", None) if not url: if debug: print("[%s] No official website, skipping" % bioguide) return None if debug: print("[%s] Downloading..." % bioguide) cache = "congress/%s.html" % bioguide body = utils.download(url, cache, force, {'check_redirects': True}) if not body: return None all_matches = [] for regex in regexes[service]: matches = re.findall(regex, body, re.I) if matches: all_matches.extend(matches) if not current == None and current in all_matches: return current if all_matches: for candidate in all_matches: passed = True for blacked in blacklist[service]: if re.search(blacked, candidate, re.I): passed = False if not passed: if debug: print("\tBlacklisted: %s" % candidate) continue return candidate return None if do_update: update() elif do_clean: clean() elif do_verify: verify() elif do_resolveyt: resolveyt() elif do_resolveig: resolveig() elif do_resolvetw: resolvetw() else: sweep() if __name__ == '__main__': main()