I would like to collect some past data from Twitter on some search terms for a project. I applied for a developer account to get access to historical data (older than 30 days). This is the first time I am using the Twitter API to collect data, so I need some help and suggestions if I am doing something wrong. I tried three ways to collect data, but I was unable to fetch anything older than 7 days; I actually need data from 2012-2017.
First, I tried using searchtweets, which is a Python wrapper for the Twitter Premium and Enterprise Search APIs.
from searchtweets import ResultStream, gen_rule_payload, load_credentials, collect_results

# first load your credentials
premium_search_args = load_credentials(filename="./search_tweets_credentials.yaml",
                                       yaml_key="search_tweets_dev",
                                       env_overwrite=False)

searchQuery1 = '"#JustinBieber" "#drunksinging" OR "#carelessDriving" OR "#outofcontrol" OR "#spitonfans" OR "#recklessdriving" OR "#assault" OR "#Pissing" OR "#spitting"'
searchQuery2 = '("Justin Bieber") ("throwing eggs" OR "drunk singing" OR "out of control" OR "found drunk" OR "spit on his fans" OR "struck a paparazzo" OR "shirtless photo" OR "found guilty" OR "reckless driving" OR "Pissing in buckets")'

rule = gen_rule_payload(searchQuery2, results_per_call=100)
# tweets = collect_results(rule, max_results=100, result_stream_args=premium_search_args) # change this if you need to
rs = ResultStream(rule_payload=rule,
                  max_results=100,
                  max_pages=1,
                  **premium_search_args)
tweets = list(rs.stream())
I get the following error:
retrying request; current status code: 403
retrying request; current status code: 403
retrying request; current status code: 403
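From reading the premium search docs, I suspect the 403 means either that my app is not attached to a dev environment in the developer console, or that the endpoint in my YAML file points at the 30-day sandbox rather than the full archive (premium search also seems to default to roughly the last 30 days unless a date range is passed). Here is a minimal sketch of what I think the full-archive request should look like; the yaml_key name and the 2014-2017 date range are my assumptions:

from searchtweets import gen_rule_payload, ResultStream, load_credentials

# assumption: this yaml_key points at the full-archive endpoint, e.g.
# https://api.twitter.com/1.1/tweets/search/fullarchive/dev.json, not the 30-day one
premium_search_args = load_credentials(filename="./search_tweets_credentials.yaml",
                                       yaml_key="search_tweets_fullarchive_dev",
                                       env_overwrite=False)

# from_date/to_date restrict the search window; without them the API appears
# to default to the most recent ~30 days, which would never reach 2012-2017
rule = gen_rule_payload(searchQuery2,
                        results_per_call=100,  # sandbox maximum
                        from_date="2014-01-01",
                        to_date="2017-12-31")

rs = ResultStream(rule_payload=rule, max_results=100, max_pages=1,
                  **premium_search_args)
tweets = list(rs.stream())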
Second, I tried calling the RESTful Twitter API directly with requests and was able to collect only 38 tweets.
client_key = '****'
client_secret = '****'

import base64, requests

key_secret = '{}:{}'.format(client_key, client_secret).encode('ascii')
b64_encoded_key = base64.b64encode(key_secret)
b64_encoded_key = b64_encoded_key.decode('ascii')

base_url = 'https://api.twitter.com/'
auth_url = '{}oauth2/token'.format(base_url)
auth_headers = {
    'Authorization': 'Basic {}'.format(b64_encoded_key),
    'Content-Type': 'application/x-www-form-urlencoded;charset=UTF-8'
}
auth_data = {
    'grant_type': 'client_credentials'
}
auth_resp = requests.post(auth_url, headers=auth_headers, data=auth_data)
# I got the access token here.
auth_resp.json()

search_headers = {
    'Authorization': 'Bearer {}'.format(auth_resp.json()['access_token'])
}
search_params = {
    'q': '("Justin Bieber") ("throwing eggs" OR "drunk singing" OR "out of control" OR "found drunk" OR "spit on his fans" OR "struck a paparazzo" OR "shirtless photo" OR "found guilty" OR "reckless driving" OR "Pissing in buckets")',
    'count': 100,
}
search_url = '{}1.1/search/tweets.json'.format(base_url)
search_resp = requests.get(search_url, headers=search_headers, params=search_params)
search_resp.status_code
tweet_data = search_resp.json()
tweet_data

import pandas as pd

unsafe_data = pd.DataFrame(columns=['text', 'created_at'])
for x in tweet_data['statuses']:
    unsafe_data.loc[len(unsafe_data)] = [x['text'], x['created_at']]
unsafe_data.shape
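As far as I can tell, this 1.1/search/tweets.json endpoint is the Standard Search API, which only indexes roughly the last 7 days, so it can never reach 2012-2017 no matter what I query. If my premium access were working, I believe the full-archive endpoint could be called directly with the same bearer token, along these lines (the 'dev' environment label and the dates are my assumptions, and the sandbox also caps rule length, so the long query might need trimming):

# assumption: 'dev' is the environment label configured in the developer console;
# the premium endpoints take a POST with a JSON body and dates in YYYYMMDDhhmm format
premium_url = '{}1.1/tweets/search/fullarchive/dev.json'.format(base_url)
premium_body = {
    'query': search_params['q'],
    'fromDate': '201401010000',
    'toDate': '201712312359',
    'maxResults': 100,  # sandbox maximum
}
premium_resp = requests.post(premium_url, headers=search_headers, json=premium_body)
premium_data = premium_resp.json()  # tweets come back under 'results', not 'statuses'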

Third, I tried the Tweepy Python library, which accesses the Twitter Standard API.
import sys, os, jsonpickle, tweepy, simplejson as json
import pandas as pd

def data_maker(query, flag):
    '''1. Download tweets into *.json files in the tweetsCollected folder. The full payload
    ('created_at', user id, etc.) is kept so we can see how the tweets were retrieved.
    2. Read those *.json files back and write *.txt files (tweet text only) for the
    pre-processing script. Also return a data frame to visualize in the notebook itself.
    '''
    # application-only authentication
    auth = tweepy.AppAuthHandler(consumer_key="****",
                                 consumer_secret="****")
    # setting up the API client
    api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)
    maxTweets = 10000
    tweetsPerQry = 100
    count = 0
    data_dir = 'data_collection_methods/data/tweetsCollected/'
    if not os.path.isdir(data_dir):
        os.makedirs(data_dir)
    file = '%s.json' % flag
    file_path = os.path.join(data_dir, file)
    # open a text file to save the tweets to
    with open(file_path, 'w') as f:
        # keeping the assumption that we are only dealing with English tweets
        try:
            for tweet in tweepy.Cursor(api.search, q=query, lang='en').items(maxTweets):
                if tweet.text is not None:
                    f.write(jsonpickle.encode(tweet._json, unpicklable=False) + '\n')
                    count += 1
                    if count % 1000 == 0:
                        print("Reached until", count, "and API call status is",
                              api.rate_limit_status()['resources']['search'])
            print("Successfully grabbed", count, "tweets")
        except tweepy.TweepError:
            # the API raises once the limit is exceeded; the file is already written by then
            print("file created, but the API raised an exception once the limit was exceeded")
    # creating a df from the file
    df = pd.DataFrame(columns=['userId', 'text', 'created_at'])
    text_path = os.path.join('data_collection_methods/data/tweetsText/', '%s.txt' % flag)
    with open(file_path, 'r') as tweets_file, open(text_path, 'w') as text:
        # parse each line as a JSON object so we can access it as a Python dict
        for line in tweets_file:
            tweet = json.loads(line)
            text.write(tweet['text'].replace('\n', '') + '\n')  # strip embedded newlines; one tweet per line
            df.loc[len(df)] = [tweet['user']['id'], tweet['text'], tweet['created_at']]  # to see from/to dates & tweets in the notebook
    return df
# create the queries and their respective data frames by calling data_maker(); after preprocessing, similar dfs are merged
unsafeQuery1 = '"#JustinBieber" "#drunksinging" OR "#carelessDriving" OR "#outofcontrol" OR "#spitonfans" OR "#recklessdriving" OR "#assault" OR "#Pissing" OR "#spitting"'
unsafeQuery2 = '("Justin Bieber") ("throwing eggs" OR "drunk singing" OR "out of control" OR "found drunk" OR "spit on his fans" OR "struck a paparazzo" OR "shirtless photo" OR "found guilty" OR "reckless driving" OR "Pissing in buckets")'
safeQuery1 = '@justinbieber #fun OR #enjoy OR #Ecstasy'
safeQuery2 = '@justinbieber'

unsafe_df1 = data_maker(unsafeQuery1, flag='unsafeQuery1')
unsafe_df2 = data_maker(unsafeQuery2, flag='unsafeQuery2')
safe_df1 = data_maker(safeQuery1, flag='safeQuery1')
safe_df2 = data_maker(safeQuery2, flag='safeQuery2')
print('Initially :',unsafe_df1.shape, unsafe_df2.shape, safe_df1.shape, safe_df2.shape)
The result was:

Successfully grabbed 0 tweets
Successfully grabbed 38 tweets
Successfully grabbed 3 tweets
Reached until 1000 and API call status is {'/search/tweets': {'limit': 450, 'remaining': 368, 'reset': 1520218776}}
Reached until 2000 and API call status is {'/search/tweets': {'limit': 450, 'remaining': 278, 'reset': 1520218776}}
Reached until 3000 and API call status is {'/search/tweets': {'limit': 450, 'remaining': 175, 'reset': 1520218776}}
Successfully grabbed 3873 tweets
Initially : (0, 3) (38, 3) (3, 3) (3873, 3)
Kindly ignore the retrieval of 3873 tweets under the safeQuery2 = '@justinbieber' query; I ran that query only to make sure there wasn't a problem with my code.
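As an aside, the "merging similar dfs" step mentioned in the code comment is just a pandas concat plus de-duplication; a rough sketch of what I have in mind:

import pandas as pd

# minimal sketch of the merge step, assuming duplicates should be dropped
# when the same tweet matches more than one query
unsafe_df = pd.concat([unsafe_df1, unsafe_df2], ignore_index=True)
unsafe_df = unsafe_df.drop_duplicates(subset=['userId', 'text'])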
The problem is that whichever of the above retrieval processes I try, I cannot grab tweets from, say, 2014-2017, yet if I type the exact same queries into the Twitter search box, I can see results from those years. Could anyone suggest where I am going wrong and what I should do instead?