import numpy as np
import pandas as pd
#pd.set_option("display.max_rows", 10)
#from tabulate import tabulate
import siuba as si
import plotnine as p9
import matplotlib.pyplot as plt
%matplotlib inline
import re
from pprint import pprint
# plotnine theme: dark base theme, overridden with bold size-20 text and a
# gray plot background outlined in black
some_theme = p9.theme_dark() + p9.theme(
    text=p9.element_text(face="bold", size=20),
    plot_background=p9.element_rect(fill='gray', colour='black'),
)
scom-gpols-nlp
230912: data analysis
# read csv: tab-separated, first line holds the column names.
# The active source is the copy hosted on GitHub; the commented-out lines
# are local-path alternatives for the same file.
#df = pd.read_csv("~/dev/ccg-web/csv/scom-gpols.csv", sep="\t", header=0)
#df = pd.read_csv("scom-gpols.csv", sep="\t", header=0)
df = pd.read_csv("https://raw.githubusercontent.com/nils-holmberg/ccg-web/main/csv/scom-gpols.csv", sep="\t", header=0)
#list(df.columns)
df.head()
| | image | id | text |
---|---|---|---|
0 | aline_lessner-higher_education-4185-7 | 4185 | Several Swedish universities place high in Eur... |
1 | aline_lessner-students-4782-7 | 4782 | Several Swedish universities place high in Eur... |
2 | aline_lessner-students-4783-7 | 4783 | Several Swedish universities place high in Eur... |
3 | anna_sigge-fashion_from_forests-6404-7 | 6404 | Knitted dress made of 100 per cent paper from ... |
4 | ann-sofi_rosenkvist-children_on_bikes-4910-7 | 4910 | Learning to ride a bike is something you'... |
from itertools import chain
# Build a long-format DataFrame with one row per sentence. Each row holds the
# id of the source row in df, the 1-based position of the sentence within
# that row's text, and the sentence itself.
#
# Sentences are obtained by splitting on the literal '. ', so the separator
# is removed from every sentence except the last one of each text.
#
# All (id, sentence_number, sentence) tuples are collected first and the
# DataFrame is constructed once. Calling pd.concat inside the loop re-copies
# the accumulated frame on every iteration; building once also yields a
# correctly-labelled empty frame when df has no rows.
sentence_rows = [
    (row_id, sentence_number, sentence)
    for row_id, text in zip(df['id'], df['text'])
    # str() lets non-string cells (e.g. missing values) be split without raising
    for sentence_number, sentence in enumerate(str(text).split('. '), start=1)
]
new_df = pd.DataFrame(sentence_rows, columns=['id', 'sentence_number', 'sentence'])
# Display the new DataFrame
new_df.head()
| | id | sentence_number | sentence |
---|---|---|---|
0 | 4185 | 1 | Several Swedish universities place high in Eur... |
1 | 4185 | 2 | Many Master's programmes are taught in En... |
2 | 4185 | 3 | More than 20,000 foreign students are studying... |
3 | 4185 | 4 | A good part of their social life revolves arou... |
4 | 4185 | 5 | The union can often help you with finding acco... |
# Create a new column flagging whether the sentence contains the substring
# 'swe' (case-insensitive): 1 if it does, 0 otherwise.
# The sentence is lower-cased and matched as a plain substring (regex=False),
# then the boolean result is converted to 0/1 integers.
new_df['contains_swe'] = (
    new_df['sentence'].str.lower().str.contains('swe', regex=False).astype(int)
)
# Display the updated DataFrame
new_df.head()
| | id | sentence_number | sentence | contains_swe |
---|---|---|---|---|
0 | 4185 | 1 | Several Swedish universities place high in Eur... | 1 |
1 | 4185 | 2 | Many Master's programmes are taught in En... | 0 |
2 | 4185 | 3 | More than 20,000 foreign students are studying... | 1 |
3 | 4185 | 4 | A good part of their social life revolves arou... | 0 |
4 | 4185 | 5 | The union can often help you with finding acco... | 0 |
230908: testing
import numpy as np
import pandas as pd
# cap the number of rows pandas shows when displaying a DataFrame
pd.set_option("display.max_rows", 10)
from tabulate import tabulate
import siuba as si
import plotnine as p9
import matplotlib.pyplot as plt
%matplotlib inline
# plotnine theme: dark base theme, overridden with bold size-20 text and a
# gray plot background outlined in black
some_theme = p9.theme_dark() + p9.theme(
    text=p9.element_text(face="bold", size=20),
    plot_background=p9.element_rect(fill='gray', colour='black'),
)
# read csv: tab-separated file from the local checkout, first line holds the
# column names
df = pd.read_csv("~/dev/ccg-web/csv/scom-gpols.csv", sep="\t", header=0)
#list(df.columns)
df.head()
| | image | id | text |
---|---|---|---|
0 | aline_lessner-higher_education-4185-7 | 4185 | Several Swedish universities place high in Eur... |
1 | aline_lessner-students-4782-7 | 4782 | Several Swedish universities place high in Eur... |
2 | aline_lessner-students-4783-7 | 4783 | Several Swedish universities place high in Eur... |
3 | anna_sigge-fashion_from_forests-6404-7 | 6404 | Knitted dress made of 100 per cent paper from ... |
4 | ann-sofi_rosenkvist-children_on_bikes-4910-7 | 4910 | Learning to ride a bike is something you'... |