forked from evelinag/StarWars-social-network
-
Notifications
You must be signed in to change notification settings - Fork 0
/
parseScripts.fs
203 lines (177 loc) · 7.35 KB
/
parseScripts.fs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
module StarWars.ParseScripts
open FSharp.Data
open System
open System.IO
open System.Text.RegularExpressions
/// Film scripts to process: each entry pairs an episode title with the URL
/// of the page that hosts its screenplay, listed in episode order.
let scriptUrls : (string * string) list =
    [ ("Episode I: The Phantom Menace", "http://www.imsdb.com/scripts/Star-Wars-The-Phantom-Menace.html")
      ("Episode II: Attack of the Clones", "http://www.imsdb.com/scripts/Star-Wars-Attack-of-the-Clones.html")
      ("Episode III: Revenge of the Sith", "http://www.imsdb.com/scripts/Star-Wars-Revenge-of-the-Sith.html")
      ("Episode IV: A New Hope", "http://www.imsdb.com/scripts/Star-Wars-A-New-Hope.html")
      ("Episode V: The Empire Strikes Back", "http://www.imsdb.com/scripts/Star-Wars-The-Empire-Strikes-Back.html")
      ("Episode VI: Return of the Jedi", "http://www.imsdb.com/scripts/Star-Wars-Return-of-the-Jedi.html")
      ("Episode VII: The Force Awakens", "http://www.imsdb.com/scripts/Star-Wars-The-Force-Awakens.html") ]
/// Load a film script page from `url` and return the HTML node holding the script.
/// The script is taken from the first <pre></pre> element; when the page has no
/// <pre>, it is taken from the first <td class="scrtext"></td> element instead.
/// Raises (via Seq.head) when the page contains neither element.
let getScriptElement (url:string) =
    // true only for <td> nodes whose class attribute is exactly "scrtext"
    let extract_td (node:HtmlNode) =
        if node.Name() <> "td" then false
        else
            match node.TryGetAttribute "class" with
            | None -> false
            | Some(t) -> t.Value() = "scrtext"
    // Download and parse the page once; both lookups reuse the same document
    // instead of fetching the URL a second time for the fallback.
    let document = HtmlDocument.Load(url)
    match document.Descendants("pre") |> Seq.tryHead with
    | Some preNode -> preNode
    | None -> document.Descendants(extract_td) |> Seq.head
/// Active pattern classifying one fragment of the script:
///   SceneTitle - the text contains "INT"/"EXT" followed by any character
///                (the '.' in the pattern is unescaped) and then a space,
///                an upper-case letter or a digit;
///   Name       - the whole text consists of upper-case letters, digits, '/',
///                '-' and spaces in the shape accepted by the name pattern;
///   Word       - anything else.
/// The scene test runs first, so a fragment matching both is a SceneTitle.
let (|SceneTitle|Name|Word|) (text:string) =
    let matches (pattern:string) = Regex.IsMatch(text, pattern)
    match text with
    | _ when matches "[ 0-9]*(INT.|EXT.)[ A-Z0-9]" -> SceneTitle text
    | _ when matches "^[/A-Z0-9]+[-]*[/A-Z0-9 ]*[-]*[/A-Z0-9 ]+$" -> Name text
    | _ -> Word
/// Partial active pattern that finds every "NAME:" style label in `text`.
/// Succeeds with the labels in order of appearance, each stripped of leading
/// and trailing ':' and ' ' characters; fails when the text has no label.
let (|MultipleNames|_|) (text:string) =
    let trimChars = ": ".ToCharArray()
    let labels =
        Regex.Matches(text, "[/A-Z0-9 -]+ *:")
        |> Seq.cast<Match>
        |> Seq.map (fun m -> m.Value.Trim(trimChars))
        |> List.ofSeq
    match labels with
    | [] -> None
    | names -> Some names
/// Recursively parse the script items and collect the character names of each scene.
///   sceneAcc     - scenes finished so far, most recent first
///   characterAcc - names seen in the scene currently being read, most recent first
///   items        - remaining script fragments, classified with SceneTitle/Name/Word
/// Returns the scenes in script order, each scene being the names in the order
/// they appeared. A scene is closed whenever a SceneTitle is met, so a title can
/// contribute an empty scene when no name preceded it.
let rec parseScenes sceneAcc characterAcc (items: string list) =
    match items with
    | item::rest ->
        match item with
        | SceneTitle _ ->
            // a new scene starts: add the finished scene to the scene accumulator
            parseScenes (List.rev characterAcc :: sceneAcc) [] rest
        | Name name ->
            // add character's name to the character accumulator
            parseScenes sceneAcc (name::characterAcc) rest
        | Word ->
            // neither a scene title nor a name: nothing to record
            parseScenes sceneAcc characterAcc rest
    | [] ->
        // End of input: no further SceneTitle will close the scene in progress,
        // so flush it here - otherwise the names of the last scene are lost.
        match characterAcc with
        | [] -> List.rev sceneAcc
        | _ -> List.rev (List.rev characterAcc :: sceneAcc)
/// Alternative function for parsing the script file: returns the labels that
/// the MultipleNames pattern finds in `text`, or an empty list when it fails.
let getAlternativeNames text =
    match (|MultipleNames|_|) text with
    | Some speakers -> speakers
    | None -> []
/// Split `text` at each of `sceneTitles` (searched in order, each search starting
/// after the previous title) and extract the names found in every segment.
///   sceneAcc    - name lists of the segments processed so far, most recent first
///   sceneTitles - scene titles expected to occur in `text`, in script order
///   text        - the part of the script that has not been consumed yet
/// Returns one list of names per segment, in script order. A segment without
/// names yields an empty list, except for the final segment, which is only
/// added when it contains names.
let rec splitAlternativeScript sceneAcc (sceneTitles: string list) (text:string) =
    match sceneTitles with
    | scene :: rest ->
        // Ordinal search: the title must occur verbatim, independent of culture.
        let idx = text.IndexOf(scene, StringComparison.Ordinal)
        if idx < 0 then
            // Title not present in the remaining text: skip it rather than
            // slicing with a negative index, which would silently eat text.
            splitAlternativeScript sceneAcc rest text
        else
            // everything before the title belongs to the segment being closed
            let currentScene = text.Substring(0, idx)
            let remaining = text.Substring(idx + scene.Length)
            let characters = getAlternativeNames currentScene
            splitAlternativeScript (characters::sceneAcc) rest remaining
    | [] ->
        // No titles left: what remains is the text after the last title.
        // Include its names so the final segment is not dropped.
        match getAlternativeNames text with
        | [] -> List.rev sceneAcc
        | lastCharacters -> List.rev (lastCharacters :: sceneAcc)
/// True when `text` contains either of the scene markers "INT." or "EXT.".
let isSceneTitle (text:string) =
    [ "INT."; "EXT." ]
    |> List.exists (fun marker -> text.Contains(marker))
// let episode = getScriptElement htmlScripts.[2] // <- problematic episodes: 0
// let episode = getScriptElement htmlScripts.[6]
/// Parse the script at `episodeUrl` and return, for every scene, the distinct
/// names found in it (an array of scenes, each an array of names).
/// Names are first read from the bold elements of the script; when fewer than
/// 20% of the bold items turn out to be names, the bold items are assumed to be
/// scene breaks only and the script text is split on them instead.
let getCharactersByScene episodeUrl =
    let scriptNode = getScriptElement episodeUrl
    // extract all script elements in bold - this works for MOST episodes
    let boldTexts =
        [ for bold in scriptNode.Elements("b") -> bold.InnerText().Trim() ]
    // scenes recovered from the bold items, without the empty ones
    let scenesFromBold =
        boldTexts
        |> parseScenes [] []
        |> List.filter (List.isEmpty >> not)
    // check if the characters were extracted correctly:
    // share of bold items that ended up as names in some scene
    let speakingCount = scenesFromBold |> List.sumBy List.length
    let speakingRatio = float speakingCount / float (List.length boldTexts)
    let scenes =
        if speakingRatio >= 0.2 then
            scenesFromBold
        else
            // bold items contain just scene breaks - use them to split the
            // screenplay and extract the character names from the text
            let scriptText = scriptNode.ToString()
            let sceneTitles = List.filter isSceneTitle boldTexts
            splitAlternativeScript [] sceneTitles scriptText
    [| for scene in scenes -> scene |> List.distinct |> Array.ofList |]
/// Split the script lines by scene and prepend the pieces to `scenes`.
/// A scene break is a line matching "<b>", optional spaces/digits, then
/// "INT" or "EXT" followed by any character. The break line itself is dropped.
/// Pieces are consed onto the accumulator, so the result lists the last piece
/// of the script first.
let rec splitByScene (script : string[]) scenes =
    let isSceneBreak (line:string) = Regex.IsMatch(line, "<b>[ 0-9]*(INT.|EXT.)")
    match Array.tryFindIndex isSceneBreak script with
    | None ->
        // no further break: the remaining lines form the last piece
        script :: scenes
    | Some breakIdx ->
        let beforeBreak = script.[0 .. breakIdx - 1]
        let afterBreak = script.[breakIdx + 1 ..]
        splitByScene afterBreak (beforeBreak :: scenes)
// Some characters have multiple names - map their names onto pre-defined values
// specified in 'aliases.csv'

// Path of the alias table, built from the directory of this source file.
// It is declared as a literal because it is used below as the static
// argument of the CSV type provider.
[<Literal>]
let aliasFile = __SOURCE_DIRECTORY__ + "/data/aliases.csv"
// Provided type for the alias table, generated by CsvProvider from aliasFile.
type Aliases = CsvProvider<aliasFile>
open Microsoft.FSharp.Reflection
/// Dictionary for translating character names between aliases.
/// Loads the rows of the alias file, keeps those whose tuple field at position
/// `episodeIdx + 2` is true, and maps each kept row's Alias to its Name.
/// NOTE(review): presumably the first two columns are the alias and the name,
/// followed by one boolean column per episode - confirm against aliases.csv.
let aliasDict episodeIdx =
    Aliases.Load(aliasFile).Rows
    // a row is a tuple: read this episode's flag from its fields by position
    |> Seq.filter (fun row -> FSharpValue.GetTupleFields(row).[episodeIdx + 2] :?> bool)
    |> Seq.map (fun row -> row.Alias, row.Name)
    |> dict
/// Alias dictionaries, one per entry of scriptUrls, indexed by episode position.
let aliasesForEpisodes =
    [| for episodeIdx in 0 .. scriptUrls.Length - 1 -> aliasDict episodeIdx |]
/// Some characters have multiple names - map their names onto pre-defined values.
/// Looks `name` up in the alias dictionary of episode `episodeIdx` and returns
/// the mapped name, or `name` itself when the dictionary has no entry for it.
let mapName episodeIdx name =
    match aliasesForEpisodes.[episodeIdx].TryGetValue(name) with
    | true, mappedName -> mappedName
    | _ -> name
/// Script for Episode 7 contains a lot of expressive terms that are not characters.
/// Returns `names` without the entries that are exactly one of those terms.
let filterClutterTerms names =
    let clutter = Set.ofList ["S POV"; "CONTINUED"; "CUT TO"; "WIDE SHOT"; "HIM"]
    let isCharacter name = not (Set.contains name clutter)
    Array.filter isCharacter names
/// Filter characters with names that also appear as general words in other episodes.
/// Returns false only for the name "mace" in episode index 6, true otherwise.
/// NOTE(review): the literal is lower-case "mace" - confirm that callers pass
/// lower-cased names, otherwise this exclusion never applies.
let characterCheck episodeIdx name =
    // MACE = MACE WINDU vs MACE = MACE wielding stormtrooper
    let isAmbiguousMace = (episodeIdx = 6 && name = "mace")
    not isAmbiguousMace
///==============================================================================
/// Colours (hex strings) assigned to specific characters, keyed by name.
let private characterColours =
    Map.ofList
        [ "ANAKIN", "#ce3b59"
          "DARTH VADER", "#000000"
          "LUKE", "#3881e5"
          "OBI-WAN", "#48D1CC"
          "C-3PO", "#FFD700"
          "R2-D2", "#bde0f6"
          "CHEWBACCA", "#A0522D"
          "HAN", "#ff9400"
          "LEIA", "#DCDCDC"
          "QUI-GON", "#4f4fb1"
          "EMPEROR", "#191970"
          "YODA", "#9ACD32"
          "PADME", "#DDA0DD"
          "JAR JAR", "#9a9a00"
          "REY", "#ffe0af"
          "KYLO REN", "#000000"
          "SNOKE", "#191970"
          "FINN", "#07b19f"
          "POE", "#a15bea"
          "BB-8", "#eb5d00" ]

/// Add colours to specific characters: returns the colour assigned to `name`,
/// or "#808080" for any name without an assigned colour.
let getCharacterColour name =
    match Map.tryFind name characterColours with
    | Some colour -> colour
    | None -> "#808080"