Architecture Note: an NFL Reddit sentiment analysis pipeline that uses NLTK for text preprocessing (tokenization, non-ASCII removal, stopword removal, and lemmatization) to prepare comment data for sentiment classification models.
%%{init: {'theme': 'dark', 'themeVariables': { 'primaryColor': '#C17852', 'primaryTextColor': '#F0F6FC', 'primaryBorderColor': '#4A5E32', 'lineColor': '#E6C98F', 'secondaryColor': '#161B22', 'tertiaryColor': '#0D1117', 'background': '#0D1117', 'mainBkg': '#161B22', 'nodeBorder': '#4A5E32', 'clusterBkg': '#161B22', 'clusterBorder': '#4A5E32', 'titleColor': '#E6C98F', 'edgeLabelBackground': '#161B22'}}}%%
flowchart LR
subgraph Source["๐ฅ Data Source"]
CSV["๐ NFL_reddit_data.csv
8,353 comments"]
end
subgraph Load["โ๏ธ Pandas Loading"]
DF["๐ DataFrame Creation"]
PR["๐ Parse Columns"]
end
subgraph Extract["๐ง Data Extraction"]
PL["๐ Player Names"]
TX["๐ฌ Comment Text"]
MT["๐ Metadata
Score, Flair"]
end
subgraph Output["๐ค Raw Data"]
RAW["๐ Raw DataFrame"]
end
CSV --> DF
DF --> PR
PR --> PL
PR --> TX
PR --> MT
PL --> RAW
TX --> RAW
MT --> RAW
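The loading stage above maps to a few lines of pandas. A minimal sketch, using the file name from the diagram; the column names (`player`, `body`, `score`, `flair`) are hypothetical stand-ins for the actual CSV schema.

```python
import pandas as pd

# Load the scraped Reddit comments into a DataFrame.
df = pd.read_csv("NFL_reddit_data.csv")
print(df.shape)  # expected: (8353, n_columns)

# Pull out the fields the later stages rely on.
# Column names here are assumptions for illustration, not the confirmed schema.
players = df["player"]             # player names
comments = df["body"]              # raw comment text
metadata = df[["score", "flair"]]  # score and flair metadata
```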
%%{init: {'theme': 'dark', 'themeVariables': { 'primaryColor': '#C17852', 'primaryTextColor': '#F0F6FC', 'primaryBorderColor': '#4A5E32', 'lineColor': '#E6C98F', 'secondaryColor': '#161B22', 'tertiaryColor': '#0D1117', 'background': '#0D1117', 'mainBkg': '#161B22', 'nodeBorder': '#4A5E32', 'clusterBkg': '#161B22', 'clusterBorder': '#4A5E32', 'titleColor': '#E6C98F', 'edgeLabelBackground': '#161B22'}}}%%
flowchart LR
subgraph Input["๐ฅ Input"]
RAW["๐ฌ Raw Comment Text"]
end
subgraph NLTK["๐ง NLTK Processing"]
TOK["โ๏ธ word_tokenize()"]
SEG["๐ Sentence Segments"]
end
subgraph ASCII["๐ค Non-ASCII Removal"]
NRM["๐ unicodedata.normalize()
NFKD"]
ASC["โ
ASCII Only"]
end
subgraph Output["๐ค Output"]
TKS["๐ Token List"]
end
RAW --> TOK
TOK --> SEG
SEG --> NRM
NRM --> ASC
ASC --> TKS
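A sketch of this stage, pairing NLTK's `word_tokenize` with the NFKD decomposition named in the diagram; the `to_ascii` helper name is introduced here for illustration.

```python
import unicodedata
import nltk
from nltk.tokenize import word_tokenize

nltk.download("punkt", quiet=True)  # Punkt tokenizer models used by word_tokenize

def to_ascii(tokens):
    """NFKD-decompose each token, then drop any bytes outside ASCII."""
    cleaned = [
        unicodedata.normalize("NFKD", tok).encode("ascii", "ignore").decode("ascii")
        for tok in tokens
    ]
    return [tok for tok in cleaned if tok]  # drop tokens emptied by the filter

tokens = word_tokenize("Mahomes played great tonight 🏈")
print(to_ascii(tokens))  # ['Mahomes', 'played', 'great', 'tonight']
```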
%%{init: {'theme': 'dark', 'themeVariables': { 'primaryColor': '#C17852', 'primaryTextColor': '#F0F6FC', 'primaryBorderColor': '#4A5E32', 'lineColor': '#E6C98F', 'secondaryColor': '#161B22', 'tertiaryColor': '#0D1117', 'background': '#0D1117', 'mainBkg': '#161B22', 'nodeBorder': '#4A5E32', 'clusterBkg': '#161B22', 'clusterBorder': '#4A5E32', 'titleColor': '#E6C98F', 'edgeLabelBackground': '#161B22'}}}%%
flowchart TB
subgraph Input["๐ฅ Tokens"]
TK["๐ Token List"]
end
subgraph Lower["๐ค Lowercase"]
LC[".lower()"]
end
subgraph Punct["โ๏ธ Punctuation"]
RX["๐ Regex Filter"]
NP["๐ซ Remove Symbols"]
end
subgraph Stop["๐ซ Stopwords"]
SW["๐ NLTK Stopwords
English Corpus"]
FLT["๐ Filter Tokens"]
end
subgraph Output["๐ค Clean Tokens"]
CLN["โ
Normalized Words"]
end
TK --> LC
LC --> RX
RX --> NP
NP --> SW
SW --> FLT
FLT --> CLN
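One plausible implementation of the lowercase, punctuation, and stopword steps, assuming a simple keep-word-characters regex as the "Regex Filter"; `clean_tokens` is an illustrative name, not necessarily the project's.

```python
import re
import nltk
from nltk.corpus import stopwords

nltk.download("stopwords", quiet=True)
stop_words = set(stopwords.words("english"))  # the NLTK English stopword corpus

def clean_tokens(tokens):
    """Lowercase, strip punctuation and symbols, then filter stopwords."""
    lowered = [tok.lower() for tok in tokens]
    # One possible regex filter: keep word characters only.
    no_punct = [re.sub(r"[^\w]", "", tok) for tok in lowered]
    return [tok for tok in no_punct if tok and tok not in stop_words]

print(clean_tokens(["The", "Chiefs", "are", "UNSTOPPABLE", "!!!"]))
# ['chiefs', 'unstoppable']
```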
%%{init: {'theme': 'dark', 'themeVariables': { 'primaryColor': '#C17852', 'primaryTextColor': '#F0F6FC', 'primaryBorderColor': '#4A5E32', 'lineColor': '#E6C98F', 'secondaryColor': '#161B22', 'tertiaryColor': '#0D1117', 'background': '#0D1117', 'mainBkg': '#161B22', 'nodeBorder': '#4A5E32', 'clusterBkg': '#161B22', 'clusterBorder': '#4A5E32', 'titleColor': '#E6C98F', 'edgeLabelBackground': '#161B22'}}}%%
flowchart LR
subgraph Input["๐ฅ Clean Tokens"]
CT["๐ Normalized Words"]
end
subgraph Lemma["๐ง WordNet Lemmatizer"]
WNL["๐ WordNetLemmatizer()"]
POS["๐ท๏ธ POS='verb'"]
LEM[".lemmatize()"]
end
subgraph Examples["๐ Transformations"]
E1["running โ run"]
E2["celebrating โ celebrate"]
E3["played โ play"]
end
subgraph Output["๐ค Base Forms"]
BF["โ
Lemmatized Tokens"]
end
CT --> WNL
WNL --> POS
POS --> LEM
LEM --> E1
LEM --> E2
LEM --> E3
E1 --> BF
E2 --> BF
E3 --> BF
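NLTK's `WordNetLemmatizer` takes the part of speech as a single-letter tag, so treating every token as a verb means passing `pos='v'`. That call reproduces the transformations shown in the diagram:

```python
import nltk
from nltk.stem import WordNetLemmatizer

nltk.download("wordnet", quiet=True)
lemmatizer = WordNetLemmatizer()

# pos='v' tells WordNet to treat each token as a verb, which is what
# collapses inflected forms like "running" down to their base form.
for word in ["running", "celebrating", "played"]:
    print(word, "->", lemmatizer.lemmatize(word, pos="v"))
# running -> run
# celebrating -> celebrate
# played -> play
```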
%%{init: {'theme': 'dark', 'themeVariables': { 'primaryColor': '#C17852', 'primaryTextColor': '#F0F6FC', 'primaryBorderColor': '#4A5E32', 'lineColor': '#E6C98F', 'secondaryColor': '#161B22', 'tertiaryColor': '#0D1117', 'background': '#0D1117', 'mainBkg': '#161B22', 'nodeBorder': '#4A5E32', 'clusterBkg': '#161B22', 'clusterBorder': '#4A5E32', 'titleColor': '#E6C98F', 'edgeLabelBackground': '#161B22'}}}%%
flowchart TB
subgraph Source["๐ฅ Data Source"]
CSV["๐ NFL Reddit CSV
8,353 Comments"]
end
subgraph Load["โ๏ธ Loading"]
PD["๐ผ Pandas DataFrame"]
end
subgraph Pipeline["๐ง normalize() Function"]
T1["1๏ธโฃ Tokenize"]
T2["2๏ธโฃ Remove Non-ASCII"]
T3["3๏ธโฃ Lowercase"]
T4["4๏ธโฃ Remove Punctuation"]
T5["5๏ธโฃ Remove Stopwords"]
T6["6๏ธโฃ Lemmatize"]
end
subgraph Apply["๐ DataFrame Apply"]
AP[".astype(str).apply(normalize)"]
CL["clean_text column"]
end
subgraph Ready["๐ค ML Ready"]
FV["๐ฏ Feature Vectors"]
SA["๐ Sentiment Analysis"]
TM["๐ Topic Modeling"]
end
CSV --> PD
PD --> T1
T1 --> T2
T2 --> T3
T3 --> T4
T4 --> T5
T5 --> T6
T6 --> AP
AP --> CL
CL --> FV
CL --> SA
CL --> TM
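End to end, the `normalize()` function and the `.astype(str).apply(normalize)` step from the diagram could be assembled as below. This is a self-contained sketch: the six numbered steps match the diagram, while the `body` column name is an assumption about the CSV schema.

```python
import re
import unicodedata
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

for pkg in ("punkt", "stopwords", "wordnet"):
    nltk.download(pkg, quiet=True)

stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

def normalize(text):
    """The six-step pipeline from the diagram, applied to one comment."""
    tokens = word_tokenize(text)                                # 1. tokenize
    tokens = [                                                  # 2. remove non-ASCII
        unicodedata.normalize("NFKD", t).encode("ascii", "ignore").decode("ascii")
        for t in tokens
    ]
    tokens = [t.lower() for t in tokens]                        # 3. lowercase
    tokens = [re.sub(r"[^\w]", "", t) for t in tokens]          # 4. remove punctuation
    tokens = [t for t in tokens if t and t not in stop_words]   # 5. remove stopwords
    return [lemmatizer.lemmatize(t, pos="v") for t in tokens]   # 6. lemmatize as verbs

df = pd.read_csv("NFL_reddit_data.csv")
# 'body' is an assumed column name for the raw comment text.
df["clean_text"] = df["body"].astype(str).apply(normalize)
```

The resulting `clean_text` column of lemmatized token lists is what downstream feature extraction, sentiment analysis, and topic modeling would consume.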