# nopycln: file
from typing import List, Optional, Tuple

import matplotlib.pyplot as plt
import numpy as np
from matplotlib.axes import Axes
from matplotlib.figure import Figure
from matplotlib.image import AxesImage
from wordcloud import WordCloud

from twixl.collections.twinl import SearchResults, WordFrequencyResults, TweetMetrics
def plot_word_cloud(
    word_frequency_results: WordFrequencyResults,
    width: int = 800,
    height: int = 400,
    max_words: int = 200,
    stopwords: Optional[List[str]] = None,
    background_color: str = "white",
    min_word_length: int = 0,
    frequency_threshold: int = 1000,
    figsize: Optional[Tuple[float, float]] = None,
) -> AxesImage:
    """Plots the word-frequency list as a wordcloud.

    :param word_frequency_results: Word frequency results.
    :param width: Width of the canvas.
    :param height: Height of the canvas.
    :param max_words: The maximum number of words in the wordcloud.
    :param stopwords: A list of stopwords that should be filtered from the
        wordcloud. The words are removed from the frequency table before
        rendering and the list is also forwarded to ``WordCloud``.
    :param background_color: Background color for the word cloud image.
    :param min_word_length: Minimum number of letters a word must
        have to be included.
    :param frequency_threshold: Only words whose frequency is strictly
        greater than this value are drawn. Defaults to 1000.
    :param figsize: Optional ``(width, height)`` passed to
        ``plt.figure``; ``None`` keeps matplotlib's default size.
    :return: The image object returned by ``plt.imshow`` for the word cloud.

    Usage::

        >>> from twixl.collections import twinl
        >>> twinl.plotting.plot_word_cloud(
        ...     word_frequencies,
        ...     stopwords=stopwords,
        ...     max_words=100,
        ...     min_word_length=4,
        ... )
        <matplotlib.image.AxesImage>
    """
    df = word_frequency_results.to_pandas(by_hour=False)
    if stopwords:
        # Filter stopwords from word frequency dataframe.
        df = df[~df["word"].isin(stopwords)]

    # Keep only the words that are frequent enough, as a {word: frequency} mapping.
    frequent = df[df["frequency"] > frequency_threshold]
    data = frequent.set_index("word").to_dict()["frequency"]  # type: ignore

    wc = WordCloud(
        width=width,
        height=height,
        max_words=max_words,
        background_color=background_color,
        stopwords=stopwords,
        min_word_length=min_word_length,
    ).generate_from_frequencies(data)

    # Draw on a fresh figure so an already active figure is not overwritten.
    plt.figure(figsize=figsize)
    # No axis details
    plt.axis("off")
    return plt.imshow(wc)
def plot_circular_bars(
    word_frequency_results: WordFrequencyResults,
    stopwords: Optional[List[str]] = None,
    group_size: int = 5,
) -> Tuple[Figure, Axes]:
    """Plots the hourly word-frequency list as a circular bar plot.

    The first ``group_size`` rows of every hour (in the order delivered by
    ``to_pandas(by_hour=True)``) are drawn as one coloured group of bars on a
    polar axis; the gaps between groups carry a clock label and reference
    lines.

    :param word_frequency_results: Word frequency results.
    :param stopwords: A list of stopwords that should be filtered from the plot.
    :param group_size: The number of bars per hour.
    :return: The figure and its polar axes.
    :raises ValueError: If any hour present in the data has fewer than
        ``group_size`` words left to plot.

    Usage::

        >>> from twixl.collections import twinl
        >>> twinl.plotting.plot_circular_bars(
        ...     word_frequencies,
        ...     stopwords=stopwords,
        ...     group_size=3,
        ... )
        (<Figure>, <PolarAxesSubplot>)
    """
    hours_in_day = 24
    gap = 3  # empty angular slots in front of every hourly group
    theta_offset = 0
    samples = 50  # points used to draw each arc-shaped line
    reference_levels = (0.2, 0.4, 0.6, 0.8, 1)  # fractions of the tallest bar

    frame = word_frequency_results.to_pandas(by_hour=True)
    if stopwords:
        # Drop the stopwords from the word frequency dataframe.
        keep = ~frame["word"].isin(stopwords)
        frame = frame[keep]

    per_group = [group_size] * hours_in_day
    frame = frame.groupby("hour").head(group_size)
    frame = frame.sort_values(by=["hour", "frequency"])

    # Every hour must supply a full group, otherwise the slot layout below
    # does not line up with the data.
    # TODO: if we want to keep this method around, fix the plotting instead of raising this error
    smallest_group = frame.groupby("hour").count()["frequency"].min()
    if smallest_group < group_size:
        raise ValueError(
            f"maximum group size based on provided data is {smallest_group}, but {group_size} expected, please lower group size"
        )

    # Reverse the row order so the bars run clockwise.
    clockwise_index = frame.index[::-1]
    heights = frame["frequency"].reindex(index=clockwise_index)
    words = frame["word"].reindex(index=clockwise_index)

    # Clock labels in plotting order: "00:00", "23:00", ..., "01:00".
    hour_labels = ["00:00"] + [f"{hour:02d}:00" for hour in range(23, 0, -1)]

    # One angular slot per bar plus `gap` slots per hour found in the data,
    # starting a quarter turn in.
    n_hours_seen = len(np.unique(frame["hour"].values))  # type: ignore
    n_slots = len(heights) + gap * n_hours_seen
    angles = np.linspace(0, 2 * np.pi, num=n_slots, endpoint=False) + 0.5 * np.pi
    bar_width = (2 * np.pi) / len(angles)

    # Slot indices that hold bars (the gap slots are skipped).
    bar_slots = []
    cursor = theta_offset
    for size in per_group:
        first_bar = cursor + gap
        bar_slots.extend(range(first_bar, first_bar + size))
        cursor = first_bar + size

    # One matplotlib cycle colour per hourly group.
    bar_colors = []
    for group_number, size in enumerate(per_group):
        bar_colors.extend([f"C{group_number}"] * size)

    # Create circular barplot with labels
    fig, ax = plt.subplots(figsize=(16, 16), subplot_kw={"projection": "polar"})
    ax.set_theta_offset(theta_offset)
    peak = heights.max()
    # The negative lower limit leaves a hole in the middle of the circle.
    ax.set_ylim(-peak, peak)
    ax.set_frame_on(False)
    ax.xaxis.grid(False)
    ax.yaxis.grid(False)
    ax.set_xticks([])
    ax.set_yticks([])
    ax.bar(
        angles[bar_slots],
        heights,
        width=bar_width,
        color=bar_colors,
        edgecolor="white",
        linewidth=2,
    )
    _add_labels(angles[bar_slots], heights, words, theta_offset, ax)

    # Walk over the groups, decorating each one with a base line, a clock
    # label and reference lines inside the gap that precedes it.
    cursor = 0
    for group_number, (hour_label, size) in enumerate(zip(hour_labels, per_group)):
        first_bar = cursor + gap

        # Line below the bars of this group
        under_bars = np.linspace(
            angles[first_bar], angles[first_bar + size - 1], num=samples
        )
        ax.plot(under_bars, [-5] * samples, color="#333333")

        # Angular span of the gap in front of this group
        gap_angles = np.linspace(angles[cursor], angles[first_bar - 1], num=samples)
        gap_center = np.mean(gap_angles)

        # Clock label identifying the group; turned 15 degrees further per group
        ax.text(
            gap_center,
            -150,
            hour_label,
            color="#333333",
            fontsize=10,
            fontweight="bold",
            ha="center",
            va="center",
            rotation=15 * group_number,
        )
        ax.plot(gap_angles, [0] * samples, color="#333333", lw=0.8)

        # Reference lines at 20%, 40%, 60%, 80% and 100% of the tallest bar
        for level in reference_levels:
            ax.plot(gap_angles, [peak * level] * samples, color="#bebebe", lw=0.8)

        cursor = first_bar + size

    # Print the value of each reference line once, in the last gap drawn.
    for level in reference_levels:
        ax.text(
            gap_center,
            peak * level + 40,
            round(peak * level),
            color="#bebebe",
            fontsize=10,
            ha="center",
            va="center",
            rotation=-15,
        )
    return fig, ax
def plot_tweet_metrics(tweet_metrics: TweetMetrics) -> Tuple[Figure, Axes]:
    """Plot the tweet metrics, scaled to millions of tweets, on a new figure.

    :param tweet_metrics: Tweet metrics; their ``to_pandas()`` output is
        divided by one million before plotting.
    :return: The created figure and the axes holding the plot.
    """
    figure, axes = plt.subplots()
    counts = tweet_metrics.to_pandas()
    counts_in_millions = counts / 1000000
    counts_in_millions.plot(ax=axes)
    axes.set_xlabel("Timestamp")
    axes.set_ylabel("Number of tweets (millions)")
    return figure, axes
def _get_label_rotation(angle, offset):
"""helper function to calculate label rotation
and alignment for circular bar plot
"""
# Rotation must be specified in degrees
rotation = np.rad2deg(angle + offset)
if angle <= 1.5 * np.pi:
alignment = "right"
rotation = rotation + 180
else:
alignment = "left"
return rotation, alignment
def _add_labels(angles, values, labels, offset, ax) -> None:
    """Write one text label just beyond the end of every bar of a circular bar plot.

    :param angles: Angle of each bar in radians.
    :param values: Height of each bar.
    :param labels: Text to show for each bar.
    :param offset: Angular offset forwarded to ``_get_label_rotation``.
    :param ax: Axes on which ``text`` is called.
    """
    # Distance between the end of a bar and the start of its label
    gap = 4

    for theta, height, caption in zip(angles, values, labels):
        # Text rotation and alignment for this angle
        text_rotation, h_align = _get_label_rotation(theta, offset)
        ax.text(
            x=theta,
            y=height + gap,
            s=caption,
            ha=h_align,
            va="center",
            rotation=text_rotation,
            rotation_mode="anchor",
        )