scom-gpols-nlp

230912: data analysis

import numpy as np
import pandas as pd
#pd.set_option("display.max_rows", 10)
#from tabulate import tabulate
import siuba as si
import plotnine as p9
import matplotlib.pyplot as plt
%matplotlib inline
import re
from pprint import pprint

# plotnine theme: dark base theme with bold size-20 text and a gray plot background
bold_text = p9.element_text(face="bold", size=20)
gray_canvas = p9.element_rect(fill='gray', colour='black')
some_theme = p9.theme_dark() + p9.theme(text=bold_text, plot_background=gray_canvas)

# read csv
#df = pd.read_csv("~/dev/ccg-web/csv/scom-gpols.csv", sep="\t", header=0)
#df = pd.read_csv("scom-gpols.csv", sep="\t", header=0)
# The file is tab-separated and its first line holds the column names.
csv_url = "https://raw.githubusercontent.com/nils-holmberg/ccg-web/main/csv/scom-gpols.csv"
df = pd.read_csv(csv_url, sep="\t", header=0)

#list(df.columns)
df.head()

	image	id	text
0	aline_lessner-higher_education-4185-7	4185	Several Swedish universities place high in Eur...
1	aline_lessner-students-4782-7	4782	Several Swedish universities place high in Eur...
2	aline_lessner-students-4783-7	4783	Several Swedish universities place high in Eur...
3	anna_sigge-fashion_from_forests-6404-7	6404	Knitted dress made of 100 per cent paper from ...
4	ann-sofi_rosenkvist-children_on_bikes-4910-7	4910	Learning to ride a bike is something you'...

from itertools import chain
# Build one (id, sentence_number, sentence) record per sentence of every row.
# Sentences are obtained by splitting the text on the literal '. ' delimiter and
# are numbered from 1 within each original row. Non-string text values are
# converted with str() before splitting, exactly as before.
#
# All records are collected first and the DataFrame is constructed once:
# calling pd.concat inside the row loop re-copies the accumulated frame on every
# iteration, and seeding it with an empty column-only frame forces object dtype
# on the numeric columns (and triggers a FutureWarning on recent pandas).
sentence_records = [
    (row_id, sentence_number, sentence)
    for row_id, text in zip(df['id'], df['text'])
    for sentence_number, sentence in enumerate(str(text).split('. '), start=1)
]
# Passing the columns explicitly keeps the expected layout even when df has no rows
new_df = pd.DataFrame(sentence_records, columns=['id', 'sentence_number', 'sentence'])
# Display the new DataFrame
new_df.head()

	id	sentence_number	sentence
0	4185	1	Several Swedish universities place high in Eur...
1	4185	2	Many Master's programmes are taught in En...
2	4185	3	More than 20,000 foreign students are studying...
3	4185	4	A good part of their social life revolves arou...
4	4185	5	The union can often help you with finding acco...

# Flag each sentence with 1 when it contains 'swe' in any letter case, otherwise 0
new_df['contains_swe'] = [
    int('swe' in sentence.lower()) for sentence in new_df['sentence']
]
# Preview the first rows, now including the flag column
new_df.head()

	id	sentence_number	sentence	contains_swe
0	4185	1	Several Swedish universities place high in Eur...	1
1	4185	2	Many Master's programmes are taught in En...	0
2	4185	3	More than 20,000 foreign students are studying...	1
3	4185	4	A good part of their social life revolves arou...	0
4	4185	5	The union can often help you with finding acco...	0

230908: testing

import numpy as np
import pandas as pd
pd.set_option("display.max_rows", 10)
from tabulate import tabulate
import siuba as si
import plotnine as p9
import matplotlib.pyplot as plt
%matplotlib inline

# plotnine theme: dark base theme with bold size-20 text and a gray plot background
theme_overrides = p9.theme(
    text=p9.element_text(face="bold", size=20),
    plot_background=p9.element_rect(fill='gray', colour='black'),
)
some_theme = p9.theme_dark() + theme_overrides

# read csv: tab-separated file whose first line holds the column names
csv_path = "~/dev/ccg-web/csv/scom-gpols.csv"
df = pd.read_csv(csv_path, sep="\t", header=0)

#list(df.columns)
df.head()

	image	id	text
0	aline_lessner-higher_education-4185-7	4185	Several Swedish universities place high in Eur...
1	aline_lessner-students-4782-7	4782	Several Swedish universities place high in Eur...
2	aline_lessner-students-4783-7	4783	Several Swedish universities place high in Eur...
3	anna_sigge-fashion_from_forests-6404-7	6404	Knitted dress made of 100 per cent paper from ...
4	ann-sofi_rosenkvist-children_on_bikes-4910-7	4910	Learning to ride a bike is something you'...