Jupyter at Bryn Mawr College |
|||
Public notebooks: /services/public/dblank / Experiments / Debate1 |
This is a Jupyter notebook detailing an analysis of the language used in the first debate.
Prepared by Doug Blank, Bryn Mawr College
For full discussion, see: http://blankversusblank.blogspot.com/2016/09/post-debate-analysis.html
Data from http://www.nytimes.com/2016/09/27/us/politics/transcript-debate.html
The published transcript contained some errors, which I corrected; you can use the corrected version here: first_debate.txt.
First, we read the text into a list of lines, replacing punctuation with spaces:
# Punctuation that should not stick to words.  Every character listed here is
# replaced by a single space so that neighbouring words stay separated.
_PUNCT_TO_SPACE = str.maketrans({ch: " " for ch in "\n.?“”:,—-"})

# The cleaning step handles curly quotes and em dashes, so decode the file
# explicitly rather than relying on the platform default.
# NOTE(review): assumes first_debate.txt is UTF-8 encoded -- confirm.
# The context manager closes the file handle once all lines have been read.
with open("first_debate.txt", encoding="utf-8") as transcript:
    # One entry per line of the file: outer whitespace stripped first, then
    # punctuation turned into spaces in a single pass.
    text = [line.strip().translate(_PUNCT_TO_SPACE) for line in transcript]

# The whole cleaned transcript as one space-joined string.
text_all = " ".join(text)
A sample to see what it looks like:
text[0]  # notebook display: the first cleaned line of the transcript
Now, we break it down by speaker:
def _squeeze_spaces(spoken):
    """Return *spoken* with each run of whitespace collapsed to one space.

    Leading and trailing whitespace is removed as well.  Splitting and
    re-joining does this in one pass; a ``replace`` loop only terminates if
    its search string is longer than its replacement.
    """
    return " ".join(spoken.split())


# Stage directions that belong to no speaker and are skipped outright.
_STAGE_DIRECTIONS = ("(APPLAUSE)", "(CROSSTALK)", "(LAUGHTER)")

# Text fragments per speaker label.  Fragments are collected in lists and
# joined once at the end, instead of growing three strings with ``+=``.
_fragments = {"HOLT": [], "TRUMP": [], "CLINTON": []}

# Label of whoever is speaking; carried over to lines that have no label.
current = None
for line in text:
    if not line or line in _STAGE_DIRECTIONS:
        continue
    spoken = line
    for label in _fragments:
        if line.startswith(label):
            # A labelled line starts a new turn; drop the label itself.
            current = label
            spoken = line[len(label):]
            break
    if current is None:
        # Unlabelled text before anybody has been introduced.
        raise Exception("No speaker?!")
    _fragments[current].append(spoken)

# Lower-case each speaker's text and normalise its spacing.
holt = _squeeze_spaces(" ".join(_fragments["HOLT"]).lower())
clinton = _squeeze_spaces(" ".join(_fragments["CLINTON"]).lower())
trump = _squeeze_spaces(" ".join(_fragments["TRUMP"]).lower())

# Notebook display: number of characters spoken by each participant.
len(holt), len(trump), len(clinton)
And split the text into words:
# Tokenise each speaker's text on single spaces.
clinton_words, trump_words, holt_words = (
    spoken.split(" ") for spoken in (clinton, trump, holt)
)
# Notebook display: total number of words per speaker.
len(clinton_words), len(trump_words), len(holt_words)

# Distinct vocabulary of each speaker.
clinton_set, trump_set, holt_set = map(
    set, (clinton_words, trump_words, holt_words)
)
# Notebook display: vocabulary size of the two candidates.
len(clinton_set), len(trump_set)
def make_dict(words):
    """Count how many times each word occurs.

    Args:
        words: An iterable of hashable items (here, word strings).

    Returns:
        A plain ``dict`` mapping each distinct item to its number of
        occurrences, with keys in order of first appearance.
    """
    # Imported locally so this definition works on its own.
    from collections import Counter

    # Counter is the standard-library tally.  Convert back to a plain dict so
    # that looking up a missing word still raises KeyError for callers.
    return dict(Counter(words))
# Word-frequency table for each candidate.
clinton_dict, trump_dict = (
    make_dict(words) for words in (clinton_words, trump_words)
)
# Very frequent function words that are left out of the frequency reports.
# ("be" appears twice; that is harmless for membership tests.)
common_words = (
    "the to and or that of a in have it be "
    "am are was were been be being is do would "
    "but what so with about at on has can as "
    "because when by an for this"
).split()
def _print_frequent_words(counts, ignored, min_count=3):
    """Print ``word: count`` lines, most frequent word first.

    Args:
        counts: Mapping of word -> number of occurrences.
        ignored: Collection of words to leave out of the report.
        min_count: Smallest count a word needs in order to be listed.
    """
    skip = set(ignored)  # built once so each membership test is a hash lookup
    frequent = [
        (word, count)
        for word, count in counts.items()
        if word not in skip and count >= min_count
    ]
    # sorted() is stable, so words with equal counts keep the mapping's order.
    for entry in sorted(frequent, key=lambda item: item[1], reverse=True):
        print("%s: %s" % entry)


# Clinton's most used words (said at least three times, common words skipped).
_print_frequent_words(clinton_dict, common_words)

# Notebook display: how often each candidate said "china" and "plan".
# .get() reports 0 for a word a speaker never used instead of raising KeyError.
clinton_dict.get("china", 0), clinton_dict.get("plan", 0)
trump_dict.get("china", 0), trump_dict.get("plan", 0)

# Trump's most used words, by the same criteria.
_print_frequent_words(trump_dict, common_words)
# Non-overlapping occurrences of the upper-case string "TRUMP" in the joined
# transcript.  NOTE(review): presumably this approximates the number of
# speaking turns, since speaker labels are upper-case -- confirm against the
# transcript.
text_all.count("TRUMP")
# Same count for the upper-case string "CLINTON".
text_all.count("CLINTON")