def fetchbin(self, start=None, end=None, binsize=timedelta(seconds=60),
             empty=False):
    """
    Returns a generator that can be used to iterate over the tweet data
    based on ``binsize``.

    Bin sizes are counted from ``self.tweet_dates`` and rows are then read
    sequentially from the ``tweets`` table of the SQLite database at
    ``self._db``, ``count`` rows per bin.

    :param start: Query start date (exclusive). Defaults to one second
        before ``self.start``.
    :type start: datetime
    :param end: Query end date (inclusive). Defaults to ``self.end``.
    :type end: datetime
    :param binsize: Time duration for each bin for tweet grouping.
    :type binsize: timedelta
    :param empty: Determines whether empty bins will be yielded.
    :type empty: boolean
    :returns: A generator of ``TweetBin(frame, left, right)`` objects, where
        ``frame`` holds the rows of the bin and ``left``/``right`` are the
        bin's time boundaries (``right = left + binsize``).
    :rtype: generator
    """
    second = timedelta(seconds=1)
    if start is None:
        start = self.start - second
    if end is None:
        end = self.end
    # The query uses a strict ``created_at > start``; step back one second so
    # rows stamped exactly at ``self.start`` are not dropped.
    if start == self.start:
        start = start - second

    # ``pd.Grouper`` replaces the removed ``pd.TimeGrouper``. Passing the
    # timedelta directly also keeps sub-second bin sizes valid.
    counts = self.tweet_dates.groupby(pd.Grouper(freq=binsize)).size()
    counts = counts[counts.index > start - binsize]
    if not empty:
        counts = counts[counts != 0]

    # NOTE(review): ``counts`` is computed over all of ``self.tweet_dates``
    # while the query below only covers (start, end]. Bins that are only
    # partially inside that window may therefore request more rows than the
    # query holds for them -- confirm against callers before relying on
    # arbitrary ``start``/``end`` values.
    conn = sqlite3.connect(self._db, detect_types=sqlite3.PARSE_DECLTYPES)
    c = conn.cursor()
    try:
        # Rows are consumed bin by bin in chronological order, so the result
        # set must be explicitly ordered by timestamp.
        c.execute(
            "SELECT * FROM tweets WHERE created_at > ? AND created_at <= ? "
            "ORDER BY created_at",
            (start, end)
        )
        for stamp, count in zip(counts.index, counts):
            frame = []
            if count > 0:
                frame = pd.DataFrame.from_records(
                    data=c.fetchmany(int(count)), columns=self.fields,
                    index='created_at'
                )
            left = stamp.to_pydatetime()
            right = left + binsize
            if len(frame) > 0 or empty:
                yield TweetBin(frame, left, right)
    finally:
        # Runs on exhaustion, on error, and when the generator is closed
        # early, so the cursor and connection are always released.
        c.close()
        conn.close()
评论列表
文章目录