Source code for twixl.collections.twinl.plotting

# nopycln: file
from typing import Tuple, List
from typing import Optional

import matplotlib.pyplot as plt
import numpy as np
from matplotlib.axes import Axes
from matplotlib.figure import Figure
from matplotlib.image import AxesImage
from wordcloud import WordCloud

from twixl.collections.twinl import SearchResults, WordFrequencyResults, TweetMetrics


def plot_tweet_frequencies(
    search_results: SearchResults,
    num_xticks: int = 5,
    title: str = "Number of tweets per day",
) -> Tuple[Figure, Axes]:
    """Plot the number of tweets per day as a bar chart.

    :param search_results: Twinl search results. Its ``to_pandas()`` frame is
        read for a datetime-like ``timestamp`` column.
    :param num_xticks: Maximum number of date labels on the x-axis. The labels
        are spread evenly over the plotted days; fewer are drawn when there
        are fewer days than requested labels.
    :param title: The figure title.
    :return: Tuple of the created figure and its axes.

    Usage::

        >>> from twixl.collections import twinl
        >>> twinl.plotting.plot_tweet_frequencies(
        >>>     search_results,
        >>>     title="Number of 'Elfstedentocht' tweets per day"
        >>> )
        (<Figure>, <AxesSubplot>)
    """
    df = search_results.to_pandas()

    # Group on a derived key rather than adding a helper column, so the frame
    # handed out by ``to_pandas()`` is not modified.
    day = df["timestamp"].dt.to_period("D")
    counts = df["timestamp"].groupby(day).count()
    num_rows = len(counts)

    fig, ax = plt.subplots()
    # Bars are centred on the integer positions 0 .. num_rows - 1.
    ax.bar(np.arange(num_rows), counts.to_numpy(), width=1)

    if num_rows > 0 and num_xticks > 0:
        # Use the same indices for tick positions and tick labels so every
        # label sits under the bar of the day it names. ``np.unique`` drops
        # the duplicates that appear when there are fewer days than ticks.
        tick_positions = np.unique(
            np.linspace(0, num_rows - 1, num_xticks, dtype=int)
        )
        tick_labels = [
            counts.index[i].strftime("%d-%m-%Y") for i in tick_positions
        ]
        ax.set_xticks(tick_positions)
        ax.set_xticklabels(tick_labels, rotation=45, ha="right")
    else:
        # Nothing to label (no data, or no ticks requested).
        ax.set_xticks([])

    ax.set_title(title)
    ax.set_ylabel("Number of tweets")

    return fig, ax
def plot_word_cloud(
    word_frequency_results: WordFrequencyResults,
    width: int = 800,
    height: int = 400,
    max_words: int = 200,
    stopwords: Optional[List[str]] = None,
    background_color: str = "white",
    min_word_length: int = 0,
    frequency_threshold: int = 1000,
) -> AxesImage:
    """Plot the word-frequency list as a word cloud.

    :param word_frequency_results: Word frequency results. Its
        ``to_pandas(by_hour=False)`` frame is read for the ``word`` and
        ``frequency`` columns.
    :param width: Width of the canvas.
    :param height: Height of the canvas.
    :param max_words: The maximum number of words in the wordcloud.
    :param stopwords: A list of stopwords that should be filtered from the
        wordcloud.
    :param background_color: Background color for the word cloud image.
    :param min_word_length: Minimum number of letters a word must have to be
        included.
    :param frequency_threshold: Only words whose frequency is strictly greater
        than this value are drawn.
    :return: The image artist created by ``plt.imshow`` for the word cloud.
    :raises ValueError: If no word is left after filtering.

    Usage::

        >>> from twixl.collections import twinl
        >>> twinl.plotting.plot_word_cloud(
        >>>     word_frequencies,
        >>>     stopwords=stopwords,
        >>>     max_words=100,
        >>>     min_word_length=4
        >>> );
        <matplotlib.image.AxesImage>
    """
    df = word_frequency_results.to_pandas(by_hour=False)

    if stopwords:
        # Filter stopwords from word frequency dataframe.
        df = df[~df["word"].isin(stopwords)]

    if min_word_length > 0:
        # The cloud is built from ready-made frequencies, so the length filter
        # is applied to the frame here rather than left to WordCloud.
        df = df[df["word"].str.len() >= min_word_length]

    df = df[df["frequency"] > frequency_threshold]
    if df.empty:
        raise ValueError(
            "no words left to plot after filtering; lower frequency_threshold "
            "or min_word_length, or pass fewer stopwords"
        )

    frequencies = df.set_index("word")["frequency"].to_dict()

    wc = WordCloud(
        width=width,
        height=height,
        max_words=max_words,
        background_color=background_color,
        stopwords=stopwords,
        min_word_length=min_word_length,
    ).generate_from_frequencies(frequencies)

    # TODO: This should be configurable?
    plt.figure()
    # No axis details
    plt.axis("off")
    return plt.imshow(wc)
def plot_circular_bars(
    word_frequency_results: WordFrequencyResults,
    stopwords: Optional[List[str]] = None,
    group_size: int = 5,
) -> Tuple[Figure, Axes]:
    """Plot the word-frequency list as a circular bar plot, grouped per hour.

    The first ``group_size`` rows of every hour (in the order delivered by
    ``to_pandas(by_hour=True)``) are drawn as one group of bars; the groups
    are laid out clockwise like a 24-hour clock face, with the time labels
    placed in the gaps between groups.

    :param word_frequency_results: Word frequency results. Its
        ``to_pandas(by_hour=True)`` frame is read for the ``hour``, ``word``
        and ``frequency`` columns.
    :param stopwords: A list of stopwords that should be filtered from the
        plot.
    :param group_size: The number of bars per hour.
    :return: Tuple of the created figure and its polar axes.
    :raises ValueError: If the data does not cover 24 distinct hours, or if
        any hour has fewer than ``group_size`` words.

    Usage::

        >>> from twixl.collections import twinl
        >>> twinl.plotting.plot_circular_bars(
        >>>     word_frequencies,
        >>>     stopwords=stopwords,
        >>>     group_size=3
        >>> );
        (<Figure>, <PolarAxesSubplot>)
    """
    hours_per_day = 24
    pad = 3  # number of empty bar slots in front of every hour group
    theta_offset = 0
    reference_fractions = [0.2, 0.4, 0.6, 0.8, 1]

    df = word_frequency_results.to_pandas(by_hour=True)

    if stopwords:
        # Filter stopwords from word frequency dataframe.
        df = df[~df["word"].isin(stopwords)]

    df = df.groupby("hour").head(group_size).sort_values(by=["hour", "frequency"])

    # The layout below (labels, colours, slot indices) is built for exactly
    # 24 groups; fail early with a clear message instead of an indexing error.
    words_per_hour = df.groupby("hour")["frequency"].count()
    if len(words_per_hour) != hours_per_day:
        raise ValueError(
            f"expected words for {hours_per_day} distinct hours, but the "
            f"provided data covers {len(words_per_hour)}"
        )

    # Raise an error if we do not have enough words per hour to plot based on
    # the group size.
    # TODO: if we want to keep this method around, fix the plotting instead of
    # raising this error
    min_words_per_hour = words_per_hour.min()
    if min_words_per_hour < group_size:
        raise ValueError(
            f"maximum group size based on provided data is {min_words_per_hour}, but {group_size} expected, please lower group size"
        )

    group_sizes = [group_size] * hours_per_day

    # Reverse the rows so the bars are plotted clockwise.
    values = df["frequency"].iloc[::-1]
    labels = df["word"].iloc[::-1]
    max_value = values.max()

    # Gap labels in plotting order: "00:00", "23:00", ..., "01:00".
    hour_labels = ["00:00"] + [f"{hour:02d}:00" for hour in range(23, 0, -1)]

    # One angular slot per bar plus `pad` empty slots per group, starting at
    # the top of the circle (0.5 * pi).
    num_slots = len(values) + pad * hours_per_day
    angles = np.linspace(0, 2 * np.pi, num=num_slots, endpoint=False) + 0.5 * np.pi
    bar_width = (2 * np.pi) / num_slots

    # Slot indices that actually hold a bar (skipping the padding slots).
    bar_slots = []
    start = 0
    for size in group_sizes:
        bar_slots.extend(range(start + pad, start + pad + size))
        start += size + pad

    # One colour of the default cycle per hour group.
    colors = [f"C{i}" for i, size in enumerate(group_sizes) for _ in range(size)]

    # Create circular barplot with labels
    fig, ax = plt.subplots(figsize=(16, 16), subplot_kw={"projection": "polar"})
    ax.set_theta_offset(theta_offset)
    # The negative lower limit leaves a hole in the middle of the circle.
    ax.set_ylim(-max_value, max_value)
    ax.set_frame_on(False)
    ax.xaxis.grid(False)
    ax.yaxis.grid(False)
    ax.set_xticks([])
    ax.set_yticks([])
    ax.bar(
        angles[bar_slots],
        values,
        width=bar_width,
        color=colors,
        edgecolor="white",
        linewidth=2,
    )
    _add_labels(angles[bar_slots], values, labels, theta_offset, ax)

    # Iterate over the groups, adding reference lines and annotations.
    rotation_step = 360 // hours_per_day
    start = 0
    rotation = 0
    for hour_label, size in zip(hour_labels, group_sizes):
        # Line below the bars of this group.
        bars_theta = np.linspace(
            angles[start + pad], angles[start + size + pad - 1], num=50
        )
        ax.plot(bars_theta, [-5] * 50, color="#333333")

        # Angular span of the padding gap in front of this group; it carries
        # the time label and the reference lines.
        gap_theta = np.linspace(angles[start], angles[start + pad - 1], num=50)

        # Add text with time to indicate group
        ax.text(
            np.mean(gap_theta),
            -150,
            hour_label,
            color="#333333",
            fontsize=10,
            fontweight="bold",
            ha="center",
            va="center",
            rotation=rotation,
        )
        ax.plot(gap_theta, [0] * 50, color="#333333", lw=0.8)

        # Reference lines at 20 %, 40 %, 60 %, 80 % and 100 % of the maximum.
        for fraction in reference_fractions:
            ax.plot(gap_theta, [max_value * fraction] * 50, color="#bebebe", lw=0.8)

        start += size + pad
        rotation += rotation_step

    # Annotate the reference lines with their values once, in the last gap.
    for fraction in reference_fractions:
        ax.text(
            np.mean(gap_theta),
            max_value * fraction + 40,
            round(max_value * fraction),
            color="#bebebe",
            fontsize=10,
            ha="center",
            va="center",
            rotation=-rotation_step,
        )

    return fig, ax
def plot_tweet_metrics(tweet_metrics: TweetMetrics) -> Tuple[Figure, Axes]: """ Plot number of tweets (in millions) for each day in the tweet metrics. """ fig, ax = plt.subplots() (tweet_metrics.to_pandas() / 1000000).plot(ax=ax) ax.set_xlabel("Timestamp") ax.set_ylabel("Number of tweets (millions)") return fig, ax def _get_label_rotation(angle, offset): """helper function to calculate label rotation and alignment for circular bar plot """ # Rotation must be specified in degrees rotation = np.rad2deg(angle + offset) if angle <= 1.5 * np.pi: alignment = "right" rotation = rotation + 180 else: alignment = "left" return rotation, alignment def _add_labels(angles, values, labels, offset, ax) -> None: """helper function to add lables to a circular bar plot""" # This is the space between the end of the bar and the label padding = 4 # Iterate over angles, values, and labels, to add all of them. for ( angle, value, label, ) in zip(angles, values, labels): angle = angle # Obtain text rotation and alignment rotation, alignment = _get_label_rotation(angle, offset) # And finally add the text ax.text( x=angle, y=value + padding, s=label, ha=alignment, va="center", rotation=rotation, rotation_mode="anchor", )