scom-gpols-nlp

230912: data analysis

import numpy as np
import pandas as pd
#pd.set_option("display.max_rows", 10)
#from tabulate import tabulate
import siuba as si
import plotnine as p9
import matplotlib.pyplot as plt
%matplotlib inline
import re
from pprint import pprint

# Shared plotnine look for the figures in this notebook: the dark base theme
# with bold, size-20 text and a gray plot background outlined in black.
_base_theme = p9.theme_dark()
_theme_overrides = p9.theme(
    text=p9.element_text(size=20, face="bold"),
    plot_background=p9.element_rect(colour='black', fill='gray'),
)
some_theme = _base_theme + _theme_overrides
# Load the tab-separated scom-gpols table straight from the GitHub repository.
# A local copy (e.g. "~/dev/ccg-web/csv/scom-gpols.csv", or "scom-gpols.csv"
# in the working directory) can be read with the same arguments instead.
csv_source = "https://raw.githubusercontent.com/nils-holmberg/ccg-web/main/csv/scom-gpols.csv"
df = pd.read_csv(csv_source, header=0, sep="\t")

# Preview the first rows (list(df.columns) shows just the column names).
df.head()
image id text
0 aline_lessner-higher_education-4185-7 4185 Several Swedish universities place high in Eur...
1 aline_lessner-students-4782-7 4782 Several Swedish universities place high in Eur...
2 aline_lessner-students-4783-7 4783 Several Swedish universities place high in Eur...
3 anna_sigge-fashion_from_forests-6404-7 6404 Knitted dress made of 100 per cent paper from ...
4 ann-sofi_rosenkvist-children_on_bikes-4910-7 4910 Learning to ride a bike is something you'...
from itertools import chain
# Split every text in `df` into sentences and store one sentence per row.
#
# Resulting columns of `new_df`:
#   id              - the `id` value of the source row in `df`
#   sentence_number - 1-based position of the sentence within its text
#   sentence        - the sentence itself
#
# Each text is coerced with str() and split on the literal '. ' separator,
# so the separator is removed from the pieces and only the last piece keeps
# whatever punctuation ends the text.
#
# All records are gathered first and the DataFrame is built in one step.
# Growing a frame with pd.concat inside the loop re-copies everything
# accumulated so far on each iteration; building once avoids that and lets
# pandas infer the column dtypes directly from the data.
sentence_records = [
    (text_id, number, sentence)
    for text_id, text in zip(df['id'], df['text'])
    for number, sentence in enumerate(str(text).split('. '), start=1)
]
new_df = pd.DataFrame(sentence_records, columns=['id', 'sentence_number', 'sentence'])
# Display the new DataFrame
new_df.head()
id sentence_number sentence
0 4185 1 Several Swedish universities place high in Eur...
1 4185 2 Many Master's programmes are taught in En...
2 4185 3 More than 20,000 foreign students are studying...
3 4185 4 A good part of their social life revolves arou...
4 4185 5 The union can often help you with finding acco...
# Add an indicator column: 1 when the sentence contains the substring 'swe'
# in any letter case, 0 otherwise.
def _mentions_swe(sentence):
    """Return 1 if the lower-cased sentence contains 'swe', else 0."""
    return int('swe' in sentence.lower())


new_df['contains_swe'] = new_df['sentence'].apply(_mentions_swe)
# Preview the frame with the added indicator column.
new_df.head()
id sentence_number sentence contains_swe
0 4185 1 Several Swedish universities place high in Eur... 1
1 4185 2 Many Master's programmes are taught in En... 0
2 4185 3 More than 20,000 foreign students are studying... 1
3 4185 4 A good part of their social life revolves arou... 0
4 4185 5 The union can often help you with finding acco... 0

230908: testing

import numpy as np
import pandas as pd
# Cap the number of rows pandas prints for a DataFrame/Series at 10.
pd.options.display.max_rows = 10
from tabulate import tabulate
import siuba as si
import plotnine as p9
import matplotlib.pyplot as plt
%matplotlib inline

# Shared plotnine look for the figures in this section: the dark base theme
# with bold, size-20 text and a gray plot background outlined in black.
_base_theme = p9.theme_dark()
_theme_overrides = p9.theme(
    text=p9.element_text(size=20, face="bold"),
    plot_background=p9.element_rect(colour='black', fill='gray'),
)
some_theme = _base_theme + _theme_overrides

# Load the tab-separated scom-gpols table from the local file under
# ~/dev/ccg-web.
local_csv = "~/dev/ccg-web/csv/scom-gpols.csv"
df = pd.read_csv(local_csv, header=0, sep="\t")

# Preview the first rows (list(df.columns) shows just the column names).
df.head()
image id text
0 aline_lessner-higher_education-4185-7 4185 Several Swedish universities place high in Eur...
1 aline_lessner-students-4782-7 4782 Several Swedish universities place high in Eur...
2 aline_lessner-students-4783-7 4783 Several Swedish universities place high in Eur...
3 anna_sigge-fashion_from_forests-6404-7 6404 Knitted dress made of 100 per cent paper from ...
4 ann-sofi_rosenkvist-children_on_bikes-4910-7 4910 Learning to ride a bike is something you'...

matplotlib