diff --git a/README.md b/README.md index 4ccb714..cf7e18d 100644 --- a/README.md +++ b/README.md @@ -67,3 +67,65 @@ Go to `featured-venues.json` and add to the bottom. Make sure that you are using - How do I deploy new changes? Create a new branch (`git checkout -b my-branch-name`), commit as normal, then push your branch and open a PR. When your change lands on `main` it will be deployed to the web automatically. + +- How do I add a new blog post? + +Create a markdown file under `static/blog-assets/posts`. The filename will be the slug in the URL. It can technically be anything, but the convention is `year-month-day-keyword.md` for consistency (e.g., `2015-12-17-next-steps.md`). In that file, write a post by taking the following steps: + +1. Specify metadata in YAML format, between `---` and `---`, at the top of your markdown post file. This is necessary for linking and sorting. + +```yaml +--- +date: 2017-10-31 # required. year-month-day, in numbers. +title: "Introducing Vega-Lite 2.0" # required. in double-quotes +banner: "../blog-assets/images/2017-10-31-vegalite2-banner.webp" # optional. if provided, it appears before the title. +paper: vega-lite # optional. if provided, it will create a link to the paper under the title (in the post page). +headliner: "..." # (1) optional, if your post is not external (i.e., there is content below this metadata section) and you want to provide a custom summary for your post + # (2) required, if your post is external (i.e., the `external` field below is provided) + # In both cases, keep it to about 100 characters for layout purposes. +external: URL # if the post is published on an external blog, provide that url here. You still need to put something in the post body for parsing purposes (e.g., "external post"), but it will be ignored. When this field is provided, the "headliner" field above is required. This will be checked when you run the test script. +--- +``` + +2. Write your post below the metadata. Use common markdown formatting options. Here are some special cases: + + a. Image caption: + +``` +![alt text](image url) +*your caption goes here.* +``` + + b. Horizontally placing images (the line breaks are all intentional): + +``` +
+ +![](../blog-assets/images/image-1) + +![](../blog-assets/images/image-2) +
+ +*Your caption goes here.* +``` + + c. A display text: +``` +

Some text

+``` + + d. A quote (Note: markdown formatting does not work within a `<div>` tag, so any formatting, such as boldface or italic, must be specified using HTML): +``` +
+Some text <i>italic</i> and <b>bold</b> +
+``` + + e. A space divider (This will be rendered as a short, center-aligned horizontal line): +``` +* * * +``` + +3. Store images in the `static/blog-assets/images` directory. For maintenance purposes, name your images starting with your post's file name. + +4. Supported headings are `<h2></h2>` (`##`) and `<h3></h3>
` (`###`). diff --git a/package.json b/package.json index 9d60411..b9a0570 100644 --- a/package.json +++ b/package.json @@ -56,6 +56,7 @@ "@types/d3": "^7.4.3", "d3": "^7.9.0", "d3-force": "^3.0.0", - "markdown-it": "^14.1.0" + "markdown-it": "^14.1.0", + "yaml": "^2.7.1" } } diff --git a/scripts/integrity-enforcement.ts b/scripts/integrity-enforcement.ts index 7e492f7..2aeb901 100644 --- a/scripts/integrity-enforcement.ts +++ b/scripts/integrity-enforcement.ts @@ -1,6 +1,8 @@ import fs from 'fs/promises'; import peopleRaw from '../static/people.json?raw'; -import type { Paper, Person } from '../src/lib/app-types'; +import type { BlogPost, Paper, Person } from '../src/lib/app-types'; +import { parsePostData, stripHTML } from "../src/lib/pasre-post"; +import markdownit from 'markdown-it' async function generateIndex() { const people = JSON.parse(peopleRaw) as Person[]; @@ -41,7 +43,39 @@ async function generateIndex() { await fs.writeFile('./static/papers-index.json', JSON.stringify(papers, null, 2)); } +async function generateBlogList() { + const postList = await fs.readdir('./static/blog-assets/posts'); + + const md = markdownit({ html: true, linkify: true }); + + const posts = [] as BlogPost[]; + for (const post of postList) { + const web_name = post.split(".").slice(0, -1).join("."); + const postRaw = await fs + .readFile(`./static/blog-assets/posts/${post}`, 'utf8') + .then((x) => parsePostData(x, web_name) as BlogPost); + const rendered_post = md.render(postRaw.post) + const summary = stripHTML(rendered_post).slice(0, 100) + const first_image = postRaw.first_image; + posts.push({ meta: postRaw.meta, post: summary, first_image }); + } + posts.sort((a, b) => { + const ad = a.meta.date; + const bd = b.meta.date; + const at = a.meta.title; + const bt = b.meta.title; + // sort by reverse mod date, break ties by alphabetic title order + return ad < bd ? 1 : ad > bd ? -1 : at < bt ? -1 : at > bt ? 
1 : 0; + }); + posts.forEach((a, i) => { + a.meta.recent = (i < 5); + }) + await fs.writeFile('./static/blog-index.json', JSON.stringify(posts, null, 2)); +} + async function main() { await generateIndex(); + await generateBlogList(); } + main(); diff --git a/src/data-integrity.test.ts b/src/data-integrity.test.ts index c28e27b..6ed71e3 100644 --- a/src/data-integrity.test.ts +++ b/src/data-integrity.test.ts @@ -1,5 +1,5 @@ import { expect, test } from 'vitest'; -import type { Paper, Person, Spotlight, FeaturedVenue, News, Venue, Course } from './lib/app-types'; +import type { Paper, Person, Spotlight, FeaturedVenue, News, Venue, Course, BlogPost } from './lib/app-types'; import papersIndexRaw from '../static/papers-index.json?raw'; import peopleRaw from '../static/people.json?raw'; @@ -8,6 +8,7 @@ import featuredVenuesRaw from '../static/featured-venues.json?raw'; import venuesRaw from '../static/venues.json?raw'; import newsRaw from '../static/news.json?raw'; import courseRaw from '../static/courses.json?raw'; +import postIndexRaw from "../static/blog-index.json?raw"; import tsj from 'ts-json-schema-generator'; import Ajv from 'ajv'; @@ -20,6 +21,7 @@ const featuredVenues = JSON.parse(featuredVenuesRaw) as FeaturedVenue[]; const venues = JSON.parse(venuesRaw) as Venue[]; const news = JSON.parse(newsRaw) as Paper[]; const courses = JSON.parse(courseRaw) as Course[]; +const postIndex = JSON.parse(postIndexRaw) as BlogPost[]; test('Web names should be unique', () => { const webNames = new Set(papers.map((paper) => paper.web_name)); @@ -67,6 +69,16 @@ test('All papers should have urls for their authors if possible', () => { expect(updatedPapers).toEqual(papers); }); +test('All blog posts should have date and title', () => { + const postsWithMissingData = postIndex.filter((p) => + // check if date, title, post (user-provided), and web name (auto-gen) are provided + !(p.meta.date && p.meta.title && p.meta.web_name && p.post) + // flag external posts that are missing a headliner for the preview + || (p.meta.external && !p.meta.headliner) + ); + expect(postsWithMissingData).toEqual([]); +}); + [ { key: 'Paper', dataset: papers, accessor: (paper: Paper): string => paper.web_name }, { @@ -98,7 +110,8 @@ key: 'Course', dataset: courses as Course[], accessor: (course: Course): string => course.name - } + }, + { key: 'BlogPost', dataset: postIndex, accessor: (post: BlogPost): string => post.meta.web_name } ].forEach(({ key, dataset, accessor }) => { test(`All ${key} values should be filled out`, () => { const ajv = new Ajv({ allErrors: true }); diff --git a/src/lib/app-types.ts b/src/lib/app-types.ts index 9c85382..5cffca3 100644 --- a/src/lib/app-types.ts +++ b/src/lib/app-types.ts @@ -83,3 +83,21 @@ export type Venue = { // venueType: 'C' | 'J' | 'B' | 'W'; venueType: 'conference' | 'journal' | 'book' | 'workshop'; }; + +export type BlogPost = { + meta: BlogPostMeta; + post: string; + first_image?: string | null; +}; + +export type BlogPostMeta = { + date: string; + display_date: string; + title: string; + web_name: string; + recent?: boolean; + headliner?: string; + banner?: string; + paper?: string; + [key: string]: any; +} \ No newline at end of file diff --git a/src/lib/pasre-post.ts b/src/lib/pasre-post.ts new file mode 100644 index 0000000..1014768 --- /dev/null +++ b/src/lib/pasre-post.ts @@ -0,0 +1,42 @@ +import type { BlogPost, BlogPostMeta } from "./app-types"; +import { parse as parseYAML } from 'yaml'; 
+import markdownit from 'markdown-it' + +export function parsePostData(text: string, web_name: string): BlogPost { + const parts = text.split("---\n"); + const metaRaw = parseYAML(parts.length == 3 ? parts[1] : "") as { [key: string]: any }; + if (!metaRaw.title) { + console.error("Untitled blog post.") + } + const meta: BlogPostMeta = { + date: metaRaw.date, + display_date: metaRaw.date ? (new Date(metaRaw.date.replace(/-/g, "/") + " PST")).toLocaleDateString("en-US", { + year: "numeric", + month: "short", + day: "numeric", + }) : "Undated", + title: metaRaw.title as string, + web_name: web_name, + } + if (metaRaw.banner) meta.banner = metaRaw.banner; + if (metaRaw.headliner) meta.headliner = metaRaw.headliner; + if (metaRaw.external) meta.external = metaRaw.external; + if (meta.external && !meta.headliner) { + console.error("An external post must have a headliner."); + } + if (metaRaw.paper) meta.paper = metaRaw.paper; + + const post = parts.length == 3 ? parts[2] : parts[0]; + + const md = markdownit({ html: true, linkify: true }); + const rendered_post = md.render(post) + let first_image = meta.banner ?? rendered_post.match(/<img[^<>]*src="([^<>"]+)"[^<>]*>/i)?.[1] ?? null; + if (first_image && first_image.startsWith("../")) first_image = first_image.replace("../", ""); + + return { meta, post: rendered_post, first_image }; +} + +export function stripHTML(html: string) { + // getting summary text for the blog + return html.replace(/<[^<>]+>/g, "") +} \ No newline at end of file diff --git a/src/lib/post-thumb.svelte b/src/lib/post-thumb.svelte new file mode 100644 index 0000000..ac73bf9 --- /dev/null +++ b/src/lib/post-thumb.svelte @@ -0,0 +1,44 @@ + + + + {#if post.first_image} +
+
+
+ {:else} +
+
+
+ {/if} +
+
+ {post.meta.title} +
{post.meta.headliner ?? post.post}...
+
{post.meta.display_date}
+
+
+ + + diff --git a/src/routes/+layout.svelte b/src/routes/+layout.svelte index d88baee..1c2d45b 100644 --- a/src/routes/+layout.svelte +++ b/src/routes/+layout.svelte @@ -24,7 +24,7 @@ }, { name: 'blog', - href: 'https://medium.com/@uwdata' + href: `${base}/blog` // 'https://medium.com/@uwdata' }, { name: 'code', diff --git a/src/routes/blog/+page.svelte b/src/routes/blog/+page.svelte new file mode 100644 index 0000000..2c1b8af --- /dev/null +++ b/src/routes/blog/+page.svelte @@ -0,0 +1,17 @@ + + + + UW Interactive Data Lab | Blog + + +
+ {#each posts as post} + + {/each} +
diff --git a/src/routes/blog/+page.ts b/src/routes/blog/+page.ts new file mode 100644 index 0000000..17f1186 --- /dev/null +++ b/src/routes/blog/+page.ts @@ -0,0 +1,10 @@ +import { base } from '$app/paths'; +import type { BlogPost } from '$lib/app-types'; +import type { PageLoad } from './$types'; + +export const load: PageLoad = async ({ fetch }) => { + const posts = await fetch(`${base}/blog-index.json`) + .then((x) => x.json() as Promise<BlogPost[]>); + + return { posts }; +}; diff --git a/src/routes/blog/[slug]/+error.svelte b/src/routes/blog/[slug]/+error.svelte new file mode 100644 index 0000000..74141fe --- /dev/null +++ b/src/routes/blog/[slug]/+error.svelte @@ -0,0 +1,8 @@ + + +{#if $page.error && $page.error.message} +

{$page.status}: {$page.error.message}

+{/if} diff --git a/src/routes/blog/[slug]/+page.svelte b/src/routes/blog/[slug]/+page.svelte new file mode 100644 index 0000000..5c2cac3 --- /dev/null +++ b/src/routes/blog/[slug]/+page.svelte @@ -0,0 +1,275 @@ + + + + UW Interactive Data Lab | Blog + + {#if meta.title}{/if} + + + + + {#if meta.title}{/if} + {#if top_image}{/if} + {#if meta.title}{/if} + {#if meta.date}{/if} + + +{#if meta.banner} + +{/if} + +{#if meta.title}

{meta.title}

{/if} + +{#if meta.date}

{meta.display_date}

{/if} + +{#if meta.paper} + +{/if} + +{#if !meta.external} + +{:else} + This is an external post. Click here for redirection. +{/if} + + +
+
+

+ Back to list +

+
+
+ +

Share this post

+ + + + Bluesky + +
+
+ + +
+
+ {#if nextPost} + + ← Next post
+ {nextPost.meta.title} +
+ {/if} +
+ +
+ + +
+

Recent Posts

+ {#each recentPosts as rp} + + {/each} +
+ + diff --git a/src/routes/blog/[slug]/+page.ts b/src/routes/blog/[slug]/+page.ts new file mode 100644 index 0000000..484e880 --- /dev/null +++ b/src/routes/blog/[slug]/+page.ts @@ -0,0 +1,43 @@ +import { error } from '@sveltejs/kit'; +import { base } from '$app/paths'; +import type { PageLoad } from './$types'; +import { parsePostData } from '$lib/pasre-post'; +import type { BlogPost, Paper } from '$lib/app-types'; + + +export const load: PageLoad = async ({ params, fetch }) => { + const web_name = params.slug; + const post = await fetch(`${base}/blog-assets/posts/${web_name}.md`) + .then((response) => response.text() as Promise) + .then((x) => { + if (x) { + const { meta, post, first_image } = parsePostData(x, web_name); + return { post, meta, first_image }; + } + return error(404, `Blog Post Not found`); + }) + .catch((e) => { + console.log(e); + return error(500, e.message); + }); + + // find paper + const paper = post.meta.paper ? await fetch(`${base}/papers/${post.meta.paper}.json`) + .then((x) => x ? x.json() as Promise : null) : null; + + const posts = await fetch(`${base}/blog-index.json`) + .then((x) => x.json() as Promise); + + const post_names = posts.map(x => x.meta.web_name); + + const recentPosts = posts.filter(x => x.meta.recent); + + // the post is sorted in the descending order of date (recent first) + const prevPostIndex = post_names.indexOf(web_name) + 1; + const prevPost = prevPostIndex >= 0 ? posts[prevPostIndex] : undefined; + + const nextPostIndex = post_names.indexOf(web_name) - 1; + const nextPost = nextPostIndex < posts.length ? posts[nextPostIndex] : undefined; + + return { post, recentPosts, prevPost, nextPost, paper }; +}; diff --git a/static/blog-assets/images/2015-12-17-next-steps-1.webp b/static/blog-assets/images/2015-12-17-next-steps-1.webp new file mode 100644 index 0000000..f95cc88 Binary files /dev/null and b/static/blog-assets/images/2015-12-17-next-steps-1.webp differ diff --git a/static/blog-assets/images/2015-12-17-next-steps-2.webp b/static/blog-assets/images/2015-12-17-next-steps-2.webp new file mode 100644 index 0000000..7ef43f6 Binary files /dev/null and b/static/blog-assets/images/2015-12-17-next-steps-2.webp differ diff --git a/static/blog-assets/images/2015-12-17-next-steps-3.webp b/static/blog-assets/images/2015-12-17-next-steps-3.webp new file mode 100644 index 0000000..a57fed2 Binary files /dev/null and b/static/blog-assets/images/2015-12-17-next-steps-3.webp differ diff --git a/static/blog-assets/images/2016-01-26-hops.gif b/static/blog-assets/images/2016-01-26-hops.gif new file mode 100644 index 0000000..de2a19c Binary files /dev/null and b/static/blog-assets/images/2016-01-26-hops.gif differ diff --git a/static/blog-assets/images/2016-02-23-vegalite-banner.webp b/static/blog-assets/images/2016-02-23-vegalite-banner.webp new file mode 100644 index 0000000..5f56581 Binary files /dev/null and b/static/blog-assets/images/2016-02-23-vegalite-banner.webp differ diff --git a/static/blog-assets/images/2016-07-21-atlas-1.webp b/static/blog-assets/images/2016-07-21-atlas-1.webp new file mode 100644 index 0000000..a98fdf8 Binary files /dev/null and b/static/blog-assets/images/2016-07-21-atlas-1.webp differ diff --git a/static/blog-assets/images/2016-09-27-surprise-1.webp b/static/blog-assets/images/2016-09-27-surprise-1.webp new file mode 100644 index 0000000..d05186a Binary files /dev/null and b/static/blog-assets/images/2016-09-27-surprise-1.webp differ diff --git a/static/blog-assets/images/2016-09-27-surprise-2.webp 
b/static/blog-assets/images/2016-09-27-surprise-2.webp new file mode 100644 index 0000000..20db1a0 Binary files /dev/null and b/static/blog-assets/images/2016-09-27-surprise-2.webp differ diff --git a/static/blog-assets/images/2016-09-27-surprise-3.webp b/static/blog-assets/images/2016-09-27-surprise-3.webp new file mode 100644 index 0000000..260644d Binary files /dev/null and b/static/blog-assets/images/2016-09-27-surprise-3.webp differ diff --git a/static/blog-assets/images/2016-09-27-surprise-4.webp b/static/blog-assets/images/2016-09-27-surprise-4.webp new file mode 100644 index 0000000..5c94db1 Binary files /dev/null and b/static/blog-assets/images/2016-09-27-surprise-4.webp differ diff --git a/static/blog-assets/images/2016-09-27-surprise-5.webp b/static/blog-assets/images/2016-09-27-surprise-5.webp new file mode 100644 index 0000000..c656bc1 Binary files /dev/null and b/static/blog-assets/images/2016-09-27-surprise-5.webp differ diff --git a/static/blog-assets/images/2016-09-27-surprise-6.webp b/static/blog-assets/images/2016-09-27-surprise-6.webp new file mode 100644 index 0000000..0551c5b Binary files /dev/null and b/static/blog-assets/images/2016-09-27-surprise-6.webp differ diff --git a/static/blog-assets/images/2016-09-27-surprise-7.webp b/static/blog-assets/images/2016-09-27-surprise-7.webp new file mode 100644 index 0000000..b7595ee Binary files /dev/null and b/static/blog-assets/images/2016-09-27-surprise-7.webp differ diff --git a/static/blog-assets/images/2017-05-02-regression-by-eye-1.webp b/static/blog-assets/images/2017-05-02-regression-by-eye-1.webp new file mode 100644 index 0000000..5db7318 Binary files /dev/null and b/static/blog-assets/images/2017-05-02-regression-by-eye-1.webp differ diff --git a/static/blog-assets/images/2017-05-02-regression-by-eye-2.webp b/static/blog-assets/images/2017-05-02-regression-by-eye-2.webp new file mode 100644 index 0000000..54688e7 Binary files /dev/null and b/static/blog-assets/images/2017-05-02-regression-by-eye-2.webp differ diff --git a/static/blog-assets/images/2017-05-02-regression-by-eye-3.webp b/static/blog-assets/images/2017-05-02-regression-by-eye-3.webp new file mode 100644 index 0000000..a9a8c4a Binary files /dev/null and b/static/blog-assets/images/2017-05-02-regression-by-eye-3.webp differ diff --git a/static/blog-assets/images/2017-05-02-regression-by-eye-4.webp b/static/blog-assets/images/2017-05-02-regression-by-eye-4.webp new file mode 100644 index 0000000..78762bb Binary files /dev/null and b/static/blog-assets/images/2017-05-02-regression-by-eye-4.webp differ diff --git a/static/blog-assets/images/2017-05-02-regression-by-eye-5.webp b/static/blog-assets/images/2017-05-02-regression-by-eye-5.webp new file mode 100644 index 0000000..f1b7bf0 Binary files /dev/null and b/static/blog-assets/images/2017-05-02-regression-by-eye-5.webp differ diff --git a/static/blog-assets/images/2017-05-02-regression-by-eye-6.webp b/static/blog-assets/images/2017-05-02-regression-by-eye-6.webp new file mode 100644 index 0000000..346efb7 Binary files /dev/null and b/static/blog-assets/images/2017-05-02-regression-by-eye-6.webp differ diff --git a/static/blog-assets/images/2017-05-02-regression-by-eye-7.webp b/static/blog-assets/images/2017-05-02-regression-by-eye-7.webp new file mode 100644 index 0000000..011f2e8 Binary files /dev/null and b/static/blog-assets/images/2017-05-02-regression-by-eye-7.webp differ diff --git a/static/blog-assets/images/2017-05-02-regression-by-eye-8.webp 
b/static/blog-assets/images/2017-05-02-regression-by-eye-8.webp new file mode 100644 index 0000000..0269326 Binary files /dev/null and b/static/blog-assets/images/2017-05-02-regression-by-eye-8.webp differ diff --git a/static/blog-assets/images/2017-05-02-regression-by-eye-9.webp b/static/blog-assets/images/2017-05-02-regression-by-eye-9.webp new file mode 100644 index 0000000..8210c1c Binary files /dev/null and b/static/blog-assets/images/2017-05-02-regression-by-eye-9.webp differ diff --git a/static/blog-assets/images/2017-05-23-graphscape-1.gif b/static/blog-assets/images/2017-05-23-graphscape-1.gif new file mode 100644 index 0000000..7ca0adf Binary files /dev/null and b/static/blog-assets/images/2017-05-23-graphscape-1.gif differ diff --git a/static/blog-assets/images/2017-07-11-gap-1.webp b/static/blog-assets/images/2017-07-11-gap-1.webp new file mode 100644 index 0000000..2cbbd4d Binary files /dev/null and b/static/blog-assets/images/2017-07-11-gap-1.webp differ diff --git a/static/blog-assets/images/2017-10-31-vegalite2-1.webp b/static/blog-assets/images/2017-10-31-vegalite2-1.webp new file mode 100644 index 0000000..475c550 Binary files /dev/null and b/static/blog-assets/images/2017-10-31-vegalite2-1.webp differ diff --git a/static/blog-assets/images/2017-10-31-vegalite2-2.gif b/static/blog-assets/images/2017-10-31-vegalite2-2.gif new file mode 100644 index 0000000..b791777 Binary files /dev/null and b/static/blog-assets/images/2017-10-31-vegalite2-2.gif differ diff --git a/static/blog-assets/images/2017-10-31-vegalite2-3.gif b/static/blog-assets/images/2017-10-31-vegalite2-3.gif new file mode 100644 index 0000000..b1f00ff Binary files /dev/null and b/static/blog-assets/images/2017-10-31-vegalite2-3.gif differ diff --git a/static/blog-assets/images/2017-10-31-vegalite2-banner.webp b/static/blog-assets/images/2017-10-31-vegalite2-banner.webp new file mode 100644 index 0000000..711e1f0 Binary files /dev/null and b/static/blog-assets/images/2017-10-31-vegalite2-banner.webp differ diff --git a/static/blog-assets/images/2018-04-02-multi-comparison-1.webp b/static/blog-assets/images/2018-04-02-multi-comparison-1.webp new file mode 100644 index 0000000..d938c72 Binary files /dev/null and b/static/blog-assets/images/2018-04-02-multi-comparison-1.webp differ diff --git a/static/blog-assets/images/2018-07-19-value-suppressing-1-2.webp b/static/blog-assets/images/2018-07-19-value-suppressing-1-2.webp new file mode 100644 index 0000000..09bb4af Binary files /dev/null and b/static/blog-assets/images/2018-07-19-value-suppressing-1-2.webp differ diff --git a/static/blog-assets/images/2018-07-19-value-suppressing-1.webp b/static/blog-assets/images/2018-07-19-value-suppressing-1.webp new file mode 100644 index 0000000..e48fbbc Binary files /dev/null and b/static/blog-assets/images/2018-07-19-value-suppressing-1.webp differ diff --git a/static/blog-assets/images/2018-07-19-value-suppressing-2.webp b/static/blog-assets/images/2018-07-19-value-suppressing-2.webp new file mode 100644 index 0000000..44d03fe Binary files /dev/null and b/static/blog-assets/images/2018-07-19-value-suppressing-2.webp differ diff --git a/static/blog-assets/images/2018-07-19-value-suppressing-3.webp b/static/blog-assets/images/2018-07-19-value-suppressing-3.webp new file mode 100644 index 0000000..95f69c7 Binary files /dev/null and b/static/blog-assets/images/2018-07-19-value-suppressing-3.webp differ diff --git a/static/blog-assets/images/2018-07-19-value-suppressing-4.webp 
b/static/blog-assets/images/2018-07-19-value-suppressing-4.webp new file mode 100644 index 0000000..fa94c95 Binary files /dev/null and b/static/blog-assets/images/2018-07-19-value-suppressing-4.webp differ diff --git a/static/blog-assets/images/2018-07-19-value-suppressing-5.webp b/static/blog-assets/images/2018-07-19-value-suppressing-5.webp new file mode 100644 index 0000000..814378f Binary files /dev/null and b/static/blog-assets/images/2018-07-19-value-suppressing-5.webp differ diff --git a/static/blog-assets/images/2018-10-16-hops-1.webp b/static/blog-assets/images/2018-10-16-hops-1.webp new file mode 100644 index 0000000..61a3c9a Binary files /dev/null and b/static/blog-assets/images/2018-10-16-hops-1.webp differ diff --git a/static/blog-assets/images/2018-10-16-hops-10.webp b/static/blog-assets/images/2018-10-16-hops-10.webp new file mode 100644 index 0000000..fc89873 Binary files /dev/null and b/static/blog-assets/images/2018-10-16-hops-10.webp differ diff --git a/static/blog-assets/images/2018-10-16-hops-11.webp b/static/blog-assets/images/2018-10-16-hops-11.webp new file mode 100644 index 0000000..581bd08 Binary files /dev/null and b/static/blog-assets/images/2018-10-16-hops-11.webp differ diff --git a/static/blog-assets/images/2018-10-16-hops-12.gif b/static/blog-assets/images/2018-10-16-hops-12.gif new file mode 100644 index 0000000..a5e5e05 Binary files /dev/null and b/static/blog-assets/images/2018-10-16-hops-12.gif differ diff --git a/static/blog-assets/images/2018-10-16-hops-13.webp b/static/blog-assets/images/2018-10-16-hops-13.webp new file mode 100644 index 0000000..04f0902 Binary files /dev/null and b/static/blog-assets/images/2018-10-16-hops-13.webp differ diff --git a/static/blog-assets/images/2018-10-16-hops-14.gif b/static/blog-assets/images/2018-10-16-hops-14.gif new file mode 100644 index 0000000..3eaa259 Binary files /dev/null and b/static/blog-assets/images/2018-10-16-hops-14.gif differ diff --git a/static/blog-assets/images/2018-10-16-hops-15.gif b/static/blog-assets/images/2018-10-16-hops-15.gif new file mode 100644 index 0000000..d6c2df5 Binary files /dev/null and b/static/blog-assets/images/2018-10-16-hops-15.gif differ diff --git a/static/blog-assets/images/2018-10-16-hops-16.webp b/static/blog-assets/images/2018-10-16-hops-16.webp new file mode 100644 index 0000000..f8ddd49 Binary files /dev/null and b/static/blog-assets/images/2018-10-16-hops-16.webp differ diff --git a/static/blog-assets/images/2018-10-16-hops-17.webp b/static/blog-assets/images/2018-10-16-hops-17.webp new file mode 100644 index 0000000..91615fb Binary files /dev/null and b/static/blog-assets/images/2018-10-16-hops-17.webp differ diff --git a/static/blog-assets/images/2018-10-16-hops-2.webp b/static/blog-assets/images/2018-10-16-hops-2.webp new file mode 100644 index 0000000..17f5859 Binary files /dev/null and b/static/blog-assets/images/2018-10-16-hops-2.webp differ diff --git a/static/blog-assets/images/2018-10-16-hops-3.gif b/static/blog-assets/images/2018-10-16-hops-3.gif new file mode 100644 index 0000000..4c67222 Binary files /dev/null and b/static/blog-assets/images/2018-10-16-hops-3.gif differ diff --git a/static/blog-assets/images/2018-10-16-hops-4.gif b/static/blog-assets/images/2018-10-16-hops-4.gif new file mode 100644 index 0000000..88166a6 Binary files /dev/null and b/static/blog-assets/images/2018-10-16-hops-4.gif differ diff --git a/static/blog-assets/images/2018-10-16-hops-5.webp b/static/blog-assets/images/2018-10-16-hops-5.webp new file mode 100644 index 
0000000..b971bd9 Binary files /dev/null and b/static/blog-assets/images/2018-10-16-hops-5.webp differ diff --git a/static/blog-assets/images/2018-10-16-hops-6.webp b/static/blog-assets/images/2018-10-16-hops-6.webp new file mode 100644 index 0000000..36f1b38 Binary files /dev/null and b/static/blog-assets/images/2018-10-16-hops-6.webp differ diff --git a/static/blog-assets/images/2018-10-16-hops-7.gif b/static/blog-assets/images/2018-10-16-hops-7.gif new file mode 100644 index 0000000..1fa70a1 Binary files /dev/null and b/static/blog-assets/images/2018-10-16-hops-7.gif differ diff --git a/static/blog-assets/images/2018-10-16-hops-8.webp b/static/blog-assets/images/2018-10-16-hops-8.webp new file mode 100644 index 0000000..25ba617 Binary files /dev/null and b/static/blog-assets/images/2018-10-16-hops-8.webp differ diff --git a/static/blog-assets/images/2018-10-16-hops-9.webp b/static/blog-assets/images/2018-10-16-hops-9.webp new file mode 100644 index 0000000..0215988 Binary files /dev/null and b/static/blog-assets/images/2018-10-16-hops-9.webp differ diff --git a/static/blog-assets/images/2018-10-22-draco-1.webp b/static/blog-assets/images/2018-10-22-draco-1.webp new file mode 100644 index 0000000..a0098cf Binary files /dev/null and b/static/blog-assets/images/2018-10-22-draco-1.webp differ diff --git a/static/blog-assets/images/2018-10-22-draco-2.webp b/static/blog-assets/images/2018-10-22-draco-2.webp new file mode 100644 index 0000000..cfb3bfe Binary files /dev/null and b/static/blog-assets/images/2018-10-22-draco-2.webp differ diff --git a/static/blog-assets/images/2018-10-22-draco-3.webp b/static/blog-assets/images/2018-10-22-draco-3.webp new file mode 100644 index 0000000..bba5ad8 Binary files /dev/null and b/static/blog-assets/images/2018-10-22-draco-3.webp differ diff --git a/static/blog-assets/images/2018-10-22-draco-4.webp b/static/blog-assets/images/2018-10-22-draco-4.webp new file mode 100644 index 0000000..d06e0e8 Binary files /dev/null and b/static/blog-assets/images/2018-10-22-draco-4.webp differ diff --git a/static/blog-assets/images/2019-08-12-errudite-1.webp b/static/blog-assets/images/2019-08-12-errudite-1.webp new file mode 100644 index 0000000..0c716cd Binary files /dev/null and b/static/blog-assets/images/2019-08-12-errudite-1.webp differ diff --git a/static/blog-assets/images/2019-08-12-errudite-10.webp b/static/blog-assets/images/2019-08-12-errudite-10.webp new file mode 100644 index 0000000..3db7134 Binary files /dev/null and b/static/blog-assets/images/2019-08-12-errudite-10.webp differ diff --git a/static/blog-assets/images/2019-08-12-errudite-11.webp b/static/blog-assets/images/2019-08-12-errudite-11.webp new file mode 100644 index 0000000..6ae4456 Binary files /dev/null and b/static/blog-assets/images/2019-08-12-errudite-11.webp differ diff --git a/static/blog-assets/images/2019-08-12-errudite-12.webp b/static/blog-assets/images/2019-08-12-errudite-12.webp new file mode 100644 index 0000000..e0c2bb5 Binary files /dev/null and b/static/blog-assets/images/2019-08-12-errudite-12.webp differ diff --git a/static/blog-assets/images/2019-08-12-errudite-13.webp b/static/blog-assets/images/2019-08-12-errudite-13.webp new file mode 100644 index 0000000..84bf41a Binary files /dev/null and b/static/blog-assets/images/2019-08-12-errudite-13.webp differ diff --git a/static/blog-assets/images/2019-08-12-errudite-14.webp b/static/blog-assets/images/2019-08-12-errudite-14.webp new file mode 100644 index 0000000..80a22e8 Binary files /dev/null and 
b/static/blog-assets/images/2019-08-12-errudite-14.webp differ diff --git a/static/blog-assets/images/2019-08-12-errudite-15.webp b/static/blog-assets/images/2019-08-12-errudite-15.webp new file mode 100644 index 0000000..5c9db9a Binary files /dev/null and b/static/blog-assets/images/2019-08-12-errudite-15.webp differ diff --git a/static/blog-assets/images/2019-08-12-errudite-2.webp b/static/blog-assets/images/2019-08-12-errudite-2.webp new file mode 100644 index 0000000..4b5a8d7 Binary files /dev/null and b/static/blog-assets/images/2019-08-12-errudite-2.webp differ diff --git a/static/blog-assets/images/2019-08-12-errudite-3.webp b/static/blog-assets/images/2019-08-12-errudite-3.webp new file mode 100644 index 0000000..4847de0 Binary files /dev/null and b/static/blog-assets/images/2019-08-12-errudite-3.webp differ diff --git a/static/blog-assets/images/2019-08-12-errudite-4.webp b/static/blog-assets/images/2019-08-12-errudite-4.webp new file mode 100644 index 0000000..4192e4e Binary files /dev/null and b/static/blog-assets/images/2019-08-12-errudite-4.webp differ diff --git a/static/blog-assets/images/2019-08-12-errudite-5.webp b/static/blog-assets/images/2019-08-12-errudite-5.webp new file mode 100644 index 0000000..e55d997 Binary files /dev/null and b/static/blog-assets/images/2019-08-12-errudite-5.webp differ diff --git a/static/blog-assets/images/2019-08-12-errudite-6.webp b/static/blog-assets/images/2019-08-12-errudite-6.webp new file mode 100644 index 0000000..40e274a Binary files /dev/null and b/static/blog-assets/images/2019-08-12-errudite-6.webp differ diff --git a/static/blog-assets/images/2019-08-12-errudite-7.webp b/static/blog-assets/images/2019-08-12-errudite-7.webp new file mode 100644 index 0000000..f56171b Binary files /dev/null and b/static/blog-assets/images/2019-08-12-errudite-7.webp differ diff --git a/static/blog-assets/images/2019-08-12-errudite-8.webp b/static/blog-assets/images/2019-08-12-errudite-8.webp new file mode 100644 index 0000000..cfdc50a Binary files /dev/null and b/static/blog-assets/images/2019-08-12-errudite-8.webp differ diff --git a/static/blog-assets/images/2019-08-12-errudite-9.webp b/static/blog-assets/images/2019-08-12-errudite-9.webp new file mode 100644 index 0000000..8bd88dd Binary files /dev/null and b/static/blog-assets/images/2019-08-12-errudite-9.webp differ diff --git a/static/blog-assets/posts/2015-12-17-next-steps.md b/static/blog-assets/posts/2015-12-17-next-steps.md new file mode 100644 index 0000000..b520c5d --- /dev/null +++ b/static/blog-assets/posts/2015-12-17-next-steps.md @@ -0,0 +1,65 @@ +--- +date: 2015-12-17 +title: "Next Steps for Data Visualization Research" +--- + +Given its youth and interdisciplinary nature, research methods and training in the field of data visualization are still developing. So, we asked ourselves: what steps might help accelerate the development of the field? Based on a group brainstorm and discussion, this article shares some of the proposals we found most promising. We hope that other researchers will join us in ongoing discussion and experiment with new approaches. + +![](../blog-assets/images/2015-12-17-next-steps-1.webp) +*Foundational works in data visualization history. 
Clockwise from the bottom-left: William Playfair’s Commercial and Political Atlas (1786), Cleveland & McGill’s graphical perception studies (1984), Jacques Bertin’s Semiology of Graphics (1968), and the PRIM-9 system (1973) by Fisherkeller, Friedman and Tukey.* + +### Background + +Though the *practice* of data visualization [stretches back for centuries](http://www.datavis.ca/milestones/), concerted *academic research* on the topic is relatively young. Two watershed moments include the publication of Bertin’s [Semiology of Graphics](http://www.amazon.com/Semiology-Graphics-Diagrams-Networks-Maps/dp/1589482611) in 1968 and Tukey’s Exploratory Data Analysis in 1970. These works were followed by early research on interactive statistical graphics (such as the [PRIM-9 system](http://stat-graphics.org/movies/prim9.html)) and then, in the 1980s, by [graphical perception research](https://www.cs.ubc.ca/~tmm/courses/cpsc533c-04-spr/readings/cleveland.pdf). + +In terms of the current research community, the IEEE Visualization conference first met in 1990, joined by Information Visualization in 1995 and Visual Analytics Science & Technology (VAST) in 2006. These conferences simultaneously convene each year at the [IEEE VIS](http://ieeevis.org/) meeting, with articles published in the journal [IEEE Transactions on Visualization and Computer Graphics (TVCG)](http://www.computer.org/web/tvcg). Visualization research papers also regularly appear at human-computer interaction conferences (such as ACM CHI and UIST), statistics conferences (JSM), and other venues. + +Visualization research contributions include new visualization techniques or software systems, the results of controlled human-subjects experiments or qualitative studies of visualization use, and theoretical treatments of visual representations, interaction techniques or design processes. Published work typically draws on research methods from one or more parent disciplines such as computer science, human-computer interaction, statistics, cartography, and perceptual and cognitive psychology. + +### Adapting the Publication and Review Process + +*Provide reviewer guidelines*. Currently, the primary sources of guidance for authors and reviewers at IEEE VIS are the [paper submission guidelines](http://ieeevis.org/year/2015/info/call-participation/paper-submission-guidelines) and [reviewer ethics guidelines](http://vgtc.org/about_us/conferences/ethics-guidelines). While both are valuable, more systematic criteria regarding specific contribution types (such as techniques, systems, and controlled experiments) might help enforce rigorous, constructive reviews. Analogous to medical practice, reviewers might benefit from “checklist” aids, organized by contribution type, to help ensure necessary pre-conditions for publication are met. At the same time, it is important to draw attention to known reviewer biases; examples include fixating on shortcomings over positive contributions, over-weighting easily fixable flaws, under-valuing novel but “unpolished” ideas, and letting initial impressions (whether positive or negative) unduly affect the scope and rigor of subsequent review. Annotated examples of both “good” and “bad” reviews could serve as valuable guides, especially for fledgling reviewers in the field. + +*Craft more targeted review forms*. At many conferences, a review consists of a single numerical rating plus textual commentary. 
However, journal reviews often involve a number of more specific ratings relating to the soundness of methods, quality of writing, and so on. We might reconsider the design of review forms to help ensure comprehensive reviewer attention. Building on the previous proposal, given a contribution type with systematic review guidelines, the review form might include a checkbox a reviewer must click to indicate that they have read and considered those guidelines. + +*Publish the reviews*. Once accepted, only the final revision of a paper and its supplemental material are published. The content of peer reviews and discussion is visible only to the authors and reviewers. This hides from public view valuable material regarding both the contributions and shortcomings of the published work. Publishing the reviews of accepted papers (maintaining reviewer anonymity) would provide context for assessing research contributions, generate example reviews to learn from, and raise the level of reviewer accountability. As a first step, authors might opt-in to published reviews, as now allowed by [Nature Communications](http://www.nature.com/ncomms/2015/151214/ncomms10277/full/ncomms10277.html). + +*Accompany articles with editorial statements*. Short of making all reviews public, published papers might be accompanied by a public summary review. This statement could highlight the research contributions that motivated acceptance, along with identified shortcomings or disagreement among reviewers. Statements could be curated by the primary committee member responsible for a paper (who is already tasked with writing a private summary review) with oversight by the papers chairs. + +*Require necessary supplemental material*. Video figures or interactive demos are commonly included alongside a paper, as static images fail to convey an interactive experience. Benchmark studies require access to the systems and environments tested, and have received increased attention in computer systems research. For controlled experiments, the backing data, stimuli, analysis scripts and specific task instructions are critical. Sometimes these can be reasonably described in the paper text, but often not. To foster replication and more substantive peer-review, we might institute more formal requirements around supplemental material. A healthy and growing trend is to use [online](https://github.com/TuftsVALT/ranking-correlation) [repositories](https://github.com/mjskay/ranking-correlation) to provide “living” supplementary material, which can be shared, copied and extended over time. + +### Promoting Discussion and Accretion + +*Public discussion forums*. Discussion of research papers actively occurs at conferences, on social media, and within research groups. Much of this discussion is either ephemeral or non-public. The community might benefit from a shared forum for research discussion, safeguarded with a modicum of neutral editorial oversight. To facilitate ongoing discussion, in-person questions after a conference presentation might explicitly transition to the online forum, with a provided URL at which the conversation can continue. A scribe might seed the discussion with a record of what was said at the conference. The end product could be living, annotated proceedings. + +*Templates to structure critique*. Many people are often hesitant to share critical feedback, in part to avoid alienating others. Structured templates may help people formulate comprehensive, constructive critiques. 
The [Stanford d.school](http://dschool.stanford.edu/) advocates an [“I Like, I Wish, What If?”](https://dschool.stanford.edu/wp-content/themes/dschool/method-cards/i-like-i-wish-what-if.pdf) format to encourage both positive and corrective comments, as well as forward-thinking extrapolation. Might we use an analogous format to scaffold research critique, which might better engage students in the process? + +![](../blog-assets/images/2015-12-17-next-steps-2.webp) +*John Tukey’s commentary on “Dynamic Graphics for Data Analysis” (1987).* + +*Response letters to journals*. A sometimes overlooked part of the research literature is response letters, which place critique into the research record. For example, Becker et al.’s classic piece on [“Dynamic Graphics for Data Analysis”](https://scholar.google.com/scholar?cluster=14817303117298653693) is immediately followed by a commentary from John Tukey. Early issues of the Human-Computer Interaction journal contain a fascinating [back](http://www.tandfonline.com/doi/abs/10.1207/s15327051hci0103_1) [and](http://www.tandfonline.com/doi/abs/10.1207/s15327051hci0203_3) [forth](http://www.tandfonline.com/doi/abs/10.1207/s15327051hci0203_4) among pioneers of the field. Published commentary, vetted by editorial or peer review, could be re-instigated in the field. + +*Replication and meta-analysis*. “Discussion” should also play out across multiple research publications. Insufficient effort is done to [replicate](https://scholar.google.com/scholar?cluster=5356311075399251558) and [verify](https://scholar.google.com/scholar?cluster=15284565102449657706) the results of prior studies. We could be producing and publishing more of this work. Once topics have received sufficient attention, [meta-analyses](https://en.wikipedia.org/wiki/Meta-analysis) could help consolidate the field’s understanding. + +### Research Methods Training + +*Promote a core curriculum*. A number of universities include a research-oriented data visualization class and related classes in HCI, statistics, and experimental design. However, some universities may lack such courses, or students may fail to take them. Developing a core curriculum for data visualization research might help both cases, guiding students and instructors alike. For example, recognizing that empirical methods were critical to multiple areas of computer science, Stanford CS faculty organized a new course on [Designing Computer Science Experiments](http://sing.stanford.edu/cs303-sp11/). + +![](../blog-assets/images/2015-12-17-next-steps-3.webp) +*Pierre Dragicevic’s cartoon comparison of statistical methods (2014).* + +*Catalog online resources*. A core curriculum could be reinforced with a catalog of learning resources, ranging from tutorials and self-guided study to online courses. Useful examples include Jake Wobbrock’s [Practical Statistics for HCI](https://medium.com/@uwdata/next-steps-for-data-visualization-research-3ef5e1a5e349#:~:text=Practical%20Statistics%20for%20HCI) and Pierre Dragicevic’s resources for [reforming statistical practice](http://www.aviz.fr/badstats). + +*Provide tutorials*. Of course, we should also be providing appropriate methods tutorials in our home organizations and at conferences. Existing tutorials usefully cover tools (e.g., [D3](http://www.jeromecukier.net/blog/2012/10/15/d3-tutorial-at-visweek-2012/)) and visualization design topics (e.g., [color design](http://www.stonesc.com/VisCourses.htm)), but rarely concern fundamental research methods training. 
The first step is to recruit high-quality instructors. The second step is to promote the tutorials so that advisors and students are made aware and motivated to attend. Tutorial materials should also be made available online so that those who can’t attend in person can still benefit. + +*Seek help*. Interdisciplinary research requires interdisciplinary expertise. As needed, we should seek out collaborators with complementary skills to help ensure world-class research. Some university departments staff “help desks.” Here at UW, one can get free consulting on topics including [statistics](http://www.csss.washington.edu/consulting), [design](http://depts.washington.edu/deshelp/about/) and [data-intensive science](http://escience.washington.edu/office-hours/). We should take advantage of such resources. + +*Recreate prior work*. One of the most powerful and immediate things a new researcher can do is replicate prior work. Interested in a specific technical area of visualization? Start by re-implementing techniques described in papers. You’ll hone your skills, gain in-depth knowledge into the techniques, and perhaps spark new research ideas. Interested in conducting human-subjects studies? Replicate (and potentially extend) prior published work. Platforms for deploying [experiments](http://www.labinthewild.org/) [online](https://www.mturk.com/mturk/welcome) make this relatively easy. You’ll quickly learn if the paper you are replicating faithfully communicates the information you need to reproduce the study. You will be forced to work through each detail of both the design and analysis of the experiment. This practice will help prepare you for future independent research. Plus, a replication might lead to unexpected or more nuanced results! + +### Going Forward + +These proposals are just some of the ideas we discussed, and are not intended to be comprehensive. Do you agree or disagree with any of the proposals above? Can you think of other proposals you believe the field should consider? Which steps should we prioritize and implement going forward? And, critically, how might we evaluate their impact? + +Please share your comments with us, here or on Twitter ([@uwdata](https://twitter.com/uwdata)). We want to thank brendan o'connor for his contributions via Twitter and the [Gordon and Betty Moore Foundation](https://www.moore.org/programs/science/data-driven-discovery) for their support. + +*This post was collaboratively written by the IDL team.* diff --git a/static/blog-assets/posts/2016-01-26-hops.md b/static/blog-assets/posts/2016-01-26-hops.md new file mode 100644 index 0000000..26291d1 --- /dev/null +++ b/static/blog-assets/posts/2016-01-26-hops.md @@ -0,0 +1,10 @@ +--- +date: 2016-01-26 +title: "Hypothetical Outcome Plots: Experiencing the Uncertain" +external: "https://medium.com/hci-design-at-uw/hypothetical-outcomes-plots-experiencing-the-uncertain-b9ea60d7c740" +headliner: "If you are like most people, including many data analysts, interpreting visualizations of uncertainty feels hard and abstract." +banner: ../blog-assets/images/2016-01-26-hops.gif +paper: hops +--- + +This is an external post. 
\ No newline at end of file diff --git a/static/blog-assets/posts/2016-02-23-vegalite.md b/static/blog-assets/posts/2016-02-23-vegalite.md new file mode 100644 index 0000000..af15d94 --- /dev/null +++ b/static/blog-assets/posts/2016-02-23-vegalite.md @@ -0,0 +1,10 @@ +--- +date: 2016-02-23 +title: "Introducing Vega-Lite" +external: "https://medium.com/hci-design-at-uw/introducing-vega-lite-438f9215f09e" +headliner: "Today we are excited to announce the official 1.0 release of Vega-Lite, a high-level format for rapidly creating visualizations for analysis and presentation." +banner: ../blog-assets/images/2016-02-23-vegalite-banner.webp +paper: vega-lite +--- + +This is an external post. \ No newline at end of file diff --git a/static/blog-assets/posts/2016-04-06-resposne.md b/static/blog-assets/posts/2016-04-06-resposne.md new file mode 100644 index 0000000..0248ea1 --- /dev/null +++ b/static/blog-assets/posts/2016-04-06-resposne.md @@ -0,0 +1,49 @@ +--- +date: 2016-04-06 +title: "Author’s Response to Stephen Few’s critique of Hypothetical Outcome Plots" +--- +Hypothetical outcome plots (HOPs) are an approach to visualizing uncertainty using a set of discrete outcomes. HOPs consist of a set of frames, each depicting a draw from a theoretical distribution, which are presented using animation or small multiples. HOPs can act as an alternative to static representations including error bars, standard pdf plots, violin plots or gradient plots or provide an additional layer of information on such plots. To learn more about HOPs, consult our [paper and related materials](http://journals.plos.org/plosone/article?id=10.1371%2Fjournal.pone.0142444) or [blog post](https://medium.com/hci-design-at-uw/hypothetical-outcomes-plots-experiencing-the-uncertain-b9ea60d7c740). + +HOPs may be especially useful for more complex plots, for which there is no standard static representation of uncertainty. Our recent paper, however, focused on simple one-, two-, and three-variable outcome plots. It provided empirical evidence from an experiment showing that untrained users are able to interpret HOPs to assess probabilities as well as or better than they are able to interpret error bars or violin plots. + +Recently, Stephen Few, a visualization practitioner, [critiqued](http://www.perceptualedge.com/blog/?p=2275) the experiments in our paper. Two major arguments about the validity of our study were made in the critique: + +1. *Appropriateness of task*: The inferences that subjects’ made about one, two and three variable plots in the study were not representative of the inferences that people make or are supposed to make with visualizations of distributions. Instead, the study was designed to favor HOPs because subjects report discrete numbers which can be counted with HOPs but not with error bars or violin plots. + +2. *Appropriateness of representation*: The probability questions that subjects were asked could be better answered by a different representation (at the extreme, presenting only a number). + +We take this opportunity to clarify why the results of our experiments are valid and to reflect on the uses for, and evaluation of, uncertainty visualizations. 
+ +*Critique 1: The inferences that subjects made about one, two and three variable plots in the study were not representative of the inferences that users make, or are supposed to make, with visualizations of distributions.* + +*Our Response*: We agree with Few that distribution plots for single random variables are often used to make inferences about central tendency, spread, and shape, such as in exploratory analysis. However, we disagree that these are the only inferences people make from distribution plots. It is also common to use distribution plots to make rough estimates of probabilities for one and two-sided intervals (properties of the cumulative distribution function). + +Consider, for example, the common case in the scientific literature where an author presents uncertainty visualizations like error bars or suggested alternatives like violin plots or gradient plots ([Correll and Gleicher 2014](http://graphics.cs.wisc.edu/Vis/ErrorBars/)) in reporting analysis results. The error bar is included to allow a viewer to get a sense of the dispersion of hypothetical statistics (e.g., sample means); in other words, the error bar shows information about the sampling distribution. This distributional information is intended to allow the user to assess the reliability of the plotted statistic. That is, the viewer should be able to make a rough judgment about the probability that the result would be above or below a threshold, or within a given interval, upon replication. + +Few also suggests that when a joint distribution of multiple random variables is presented, comparative judgments (e.g., pr(B > A)) are not common. We disagree. In fact, it is quite common in many real world tasks. For example, we might use collected data to predict the probability that one bus will reach our bus stop sooner than another, that one athlete will run faster than another in a race, or that one stock will have a higher value at a predetermined sell date. There are also analogous non-prediction tasks similar to the question we asked. For example, given historical data, was bus A or bus B more likely to arrive at stop Y first. A natural extension to the tasks we used in our study incorporates effect size in addition to ordering. For example, we could ask the subject to estimate the probability that replicating the experiment would show an effect at least as big as some threshold. + +In the scientific literature, when multiple error bars represent the dispersion of sampling distributions, the visualizations are also intended to enable comparative judgments. For example, what is the probability that a draw from distribution B (the sampling distribution for the mean of the treatment condition) will be larger than a draw from distribution A (the sampling distribution for the control condition)? The error bars are meant to give a viewer a way to intuitively assess such probabilities. Unfortunately, most people, including scientific researchers, fail to make such comparisons accurately with conventional representations of confidence intervals like error bars ([Belia et al. 2005](http://isites.harvard.edu/fs/docs/icb.topic477909.files/misunderstood_confidence.pdf)). These issues, among others, have led statistical reformers to call for new representations and tools that make it easier for people to assess reliability without relying solely on hypothesis tests ([Cumming 2012](https://www.routledge.com/products/9780415879682)). 
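+
+To make this comparative judgment concrete, here is a small, purely illustrative sketch (the means and standard errors below are hypothetical, not data from the study discussed above) of the quantity such error bars are meant to support — estimating pr(B > A) by simulating draws from two sampling distributions:
+
+```typescript
+// Hypothetical sampling distributions for a control (A) and treatment (B) condition.
+// Error bars summarize these distributions; the judgment of interest is pr(B > A).
+function normalSample(mean: number, sd: number): number {
+  // Box-Muller transform for an approximately normal draw.
+  const u = 1 - Math.random();
+  const v = Math.random();
+  return mean + sd * Math.sqrt(-2 * Math.log(u)) * Math.cos(2 * Math.PI * v);
+}
+
+const a = { mean: 50, se: 4 }; // control condition (hypothetical)
+const b = { mean: 56, se: 4 }; // treatment condition (hypothetical)
+
+let bBeatsA = 0;
+const draws = 100000;
+for (let i = 0; i < draws; i++) {
+  if (normalSample(b.mean, b.se) > normalSample(a.mean, a.se)) bBeatsA++;
+}
+console.log(`Estimated pr(B > A) ≈ ${(bBeatsA / draws).toFixed(2)}`);
+```
+
+Viewers of static error bars are implicitly asked to make this kind of estimate by eye; HOPs instead externalize the draws as individual frames.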
+ +*Critique 1, continued: The study was designed to favor HOPs because subjects report discrete numbers which can be counted with HOPs but not with error bars or violin plots.* + +Evaluations typically operationalize performance as accuracy — how close the user’s inference is to some verifiable property of the data. For evaluating an uncertainty visualization, we think that the appropriate verifiable properties are probabilities of specific outcomes (e.g., the probability that a randomly selected student scored above 80; the probability that a randomly selected student from class A scored higher than a randomly selected student from B; the probability that the mean grade in class A would be higher than in class B if the exam were repeated). + +An alternative to asking subjects to report a probability is to have subjects view the uncertainty representation, report some property, and then express how confident they are about the report. This approach is problematic because there is no correct amount of confidence to report, and consequently no way to assess the correctness of responses about individual plots. It is only possible to infer that subjects are reading multiple plots inconsistently. That method is therefore less precise than asking questions with verifiable answers, as we did. + +We frame the probability questions in our study as frequencies (how many times out of 100) rather than as probabilities. Using a frequency framing to elicit probabilities has been shown to improve people’s abilities to engage in Bayesian reasoning over directly asking for probabilities ([Gigerenzer and Hoffrage 1995](http://www.cogsci.ucsd.edu/~coulson/203/GG_How_1995.pdf)), thereby reducing the noisiness of estimates. While HOPs users may count to reach their estimates, we do not know if this is the best characterization of what most users do, or the most successful strategy for using HOPs. + +*Critique 2: The probability questions that subjects were asked could be better answered by presenting an alternative visualization designed for that query.* + +*Our Response*: Taken to the extreme, a number that distills the target property into a single measure will always be the best representation for conveying that single property. However, well-designed visualizations represent data efficiently in ways that enable multiple inferences. Clearly, providing highly tailored visualizations or the answers in numeric form will make it easier for users to answer specific questions, but the fair assessment is whether users can make inferences from the generic visualizations. + +Thus, for example, if we only wanted users to be able to identify the mean score of students on an exam, it would be best to just plot that mean as a single dot, or even to provide a number with no visualization. To enable inference of only the percentage of the students who scored higher than 80, again a single number will outperform any visualization. If, on the other hand, we want users to be able to estimate the percentage of students who scored higher than all possible cutoffs *k*, then we might design a visualization such as an error bar or violin plot or HOPs with the goal of supporting these inferences. To assess whether the visualization works, it is reasonable to conduct a test of subjects’ ability to answer the question for varying cutoffs. 
+ +Of course, there are still legitimate questions for future research about how HOPs are used and where they work best, including whether HOPs are useful in an analysis setting compared to other static or interactive representations, and by what exact mechanism they work (e.g., counting, integration through ensemble processing, etc). Our recent work provides clear empirical evidence that untrained users can use HOPs in order to read off properties of cumulative distribution functions and comparative properties of joint distributions. We suspect that HOPs will turn out to be even more useful for more complex plots, where there is no standard static representation of uncertainty. We hope others will join us in conducting further research on representations of uncertainty, including how interactive selections and hybrid visualizations that combine dynamic and static depictions support reliability judgments. + +Factual clarifications: + +1. Few refers to HOPs as our invention. This is not quite accurate. Animated HOPs or HOPs-like visualizations have appeared elsewhere in education, research, and the media. For example, [“the dance of the means”](https://www.youtube.com/watch?v=iJ4kqk3V8jQ) is used to support intuitions for inferential statistics like confidence intervals ([Cumming 2012](https://www.routledge.com/products/9780415879682)). Animated hypothetical outcomes have been used to visualize uncertainty in geospatial data ([Fisher 1994](http://cat.inist.fr/?aModele=afficheN&cpsidt=4262227), [Ehlschlaeger et al. 1997](https://www.researchgate.net/profile/Charles_Ehlschlaeger/publication/222051179_Visualizing_spatial_data_uncertainty_using_animation/links/5400bf590cf23d9765a44f8a.pdf), [Bastin et al. 2002](http://www.sciencedirect.com/science/article/pii/S0098300401000516)). HOPs-like visualizations also appear in recent interactives by Amanda Cox and others at the New York Times (e.g., http://goo.gl/Oiq3W6, http://goo.gl/Df8orO, http://goo.gl/PdBxfY). Our work is the first to describe and study HOPs as a generalizable technique for presenting uncertainty. This includes identifying design requirements for applying HOPs to simple plots of random variables as well as more complex data forms and visualizations (e.g., clustered network diagrams, geospatial visualizations, tree diagrams), and evaluating how inferences based on HOPs compare to those made with static representations. + +2. Few reports that in our study, the animated HOPs that subjects are shown contain approximately 100 frames randomly sampled from 5000 generated draws. This is not accurate. Each animated HOPs in our study was generated to display all 5000 draws as individual frames. The same 5000 draws was used to generate the error bars and violin plots. Few may be referring to the median number of frames that we estimate to have been displayed from the time a HOPs subject loaded the page to the time they submitted their response(s). + +*This post was authored by Jessica Hullman, in collaboration with Paul Resnick and Eytan Adar. 
Thanks also to Jeffrey Heer and Matthew Kay for feedback.* diff --git a/static/blog-assets/posts/2016-07-21-atlas.md b/static/blog-assets/posts/2016-07-21-atlas.md new file mode 100644 index 0000000..269f7bb --- /dev/null +++ b/static/blog-assets/posts/2016-07-21-atlas.md @@ -0,0 +1,9 @@ +--- +date: 2016-07-21 +title: "Atlas of Me: Personalized Spatial Analogy Maps for Unfamiliar Measurements" +external: "https://medium.com/hci-design-at-uw/atlas-of-me-personalized-spatial-analogy-maps-for-unfamiliar-measurements-e20566d94b52" +headliner: "We created Atlas of Me, a Chrome plugin that generates personalized spatial analogy maps for distances and areas." +banner: ../blog-assets/images/2016-07-21-atlas-1.webp +--- + +This is an external post. \ No newline at end of file diff --git a/static/blog-assets/posts/2016-09-27-surprise.md b/static/blog-assets/posts/2016-09-27-surprise.md new file mode 100644 index 0000000..df5898f --- /dev/null +++ b/static/blog-assets/posts/2016-09-27-surprise.md @@ -0,0 +1,91 @@ +--- +date: 2016-09-27 +title: "Surprise Maps: Showing the Unexpected" +paper: surprise-maps +--- + +![](../blog-assets/images/2016-09-27-surprise-1.webp) + +In 1977, Jerry Ehman — an astronomer working with the SETI project to seek out alien life — came across an interesting radio signal, one needle in the haystack of all of the electromagnetic signals that SETI monitors. An incredibly strong radio signal, one that matches many of the parameters we’d expect to see if aliens were really trying to communicate with us. So impressed was he with this data, that he circled the signal in red ink and wrote “Wow!” in the margins; it’s been called the [“Wow!” signal](https://en.wikipedia.org/wiki/Wow!_signal) ever since. + +As the Wow! signal illustrates, often when we analyze data we are not interested in business as usual: what we care about are exceptions to the rule, outliers, and generally the unexpected. Nobody brings out the red ink to circle data when everything is normal. And yet, when we present data visually, exceptions and outliers may get lost in the sea of usual variation. We need a visualization equivalent of Ehman’s “wow!” annotation. + +For geographic data, our proposed solution is called a [Surprise Map](http://idl.cs.washington.edu/papers/surprise-maps/): a form of heat map that gives more weight to *surprising* data. The idea behind Surprise Maps is that when we look at data, we often have various *models of expectation*: things we expect to see, or not see, in our data. If we have these models, we can also measure *deviation* or *difference* from these models. This deviation is the *unexpected*, the data that surprise us. Such surprising data is sometimes important, and at the very least justifies follow-up analysis. + +Surprise maps are useful when the raw numbers, by themselves, don’t tell us much: visual patterns might look complex but convey only statistical noise, or patterns may look simple but hide the really interesting features. + +## Canadian Mischief + +Here’s an example. “Mischief” is a category of property crime: it includes things like vandalism and graffiti, where the intent is neither to steal anything nor hurt anybody. We might wonder “which province or territory of Canada is the most safe from mischief?” Well, we have a list of provinces, and a **count** of events of mischief, and so a common design choice would be a choropleth map (a.k.a. 
heat map) of the data:
+
+![](../blog-assets/images/2016-09-27-surprise-2.webp)
+
+Looking at the map, we notice that Ontario has the most mischief. However, there is a **confound**. Ontario also has the highest *population*. More people means more crime. So let’s normalize to the per capita *rate* of crime:
+
+![](../blog-assets/images/2016-09-27-surprise-3.webp)
+
+Now the picture is nearly the opposite of what we saw in our first map. Maybe the Northwest Territories are the most dangerous!? Not so fast… the Northwest Territories are one of the least populous territories of Canada. Fewer than 44,000 people live there, compared to the more than 13 *million* people who live in Ontario.
+
+Are the Northwest Territories really that dangerous, or are they merely a victim of what Howard Wainer has called [“The Most Dangerous Equation?”](http://press.princeton.edu/chapters/s8863.pdf) That is, when populations are low, variation tends to be high. As an extreme example, imagine a province with only two people in it: if person A commits a crime on person B, then that’s a per capita rate of 50%, the highest in the nation! If A and B peacefully coexist, that’s a per capita rate of 0%, similarly the lowest in the nation. Neither case really offers much evidence that this two-person province is the safest or most dangerous place to live.
+
+Here’s a concrete example of how population can make these maps tricky to interpret. Suppose there is a disease that is endemic to the U.S., and I want to find which counties are safest. Here is a map showing the top 10% safest counties in pink (places with the lowest per capita rate of the disease):
+
+![](../blog-assets/images/2016-09-27-surprise-4.webp)
+
+Now, let’s speculate about *why* these counties are safest. Maybe the fresh air of the Great Plains has something to do with it, or maybe there’s something about cities that makes people susceptible. But, when we take a look at the top 10% most *dangerous* counties (in purple), we get a very similar map:
+
+![](../blog-assets/images/2016-09-27-surprise-5.webp)
+
+In reality, there is no geographic pattern whatsoever. All I have done is given each citizen of the U.S. an equal chance of infection and in effect just flipped a coin for each of them. Less populous counties have fewer coin flips, and so the impact of these flips matters more. In a way, they have more room to be “lucky” or “unlucky.” The map of rates across the entire country shows the full story (purple is more dangerous, pinker is safer):
+
+![](../blog-assets/images/2016-09-27-surprise-6.webp)
+
+As population increases, variability decreases. So we get “checker boards” of high and low values in the sparser regions of the country, and a uniform pink everywhere else. Population, not geography, is what drives these spurious “interesting” patterns.
+
+So let’s turn back to our example of Canadian mischief. Instead of coloring regions based on the *highest* values, let’s color them based on the most *surprising* values. We’ll describe how we calculate surprise later, but for now, let’s assume two things:
+
+1. If there are no big geographical differences in mischief, we’d expect each province to have the *same* per capita rate.
+
+2. If there is no big geographic difference in mischief, we’d expect variability to increase for smaller populations.
+
+Given these two **models** of how we expect the data to appear, we can measure *deviations* from these models.
Counter-examples to assumptions 1 and 2 would be a province with a much higher or lower per capita rate than any of the others, or a province with a large population *and* a high deviation from the average per capita rate. A **Surprise Map** highlights where these counter-examples occur. The bluish regions are places where we have less mischief than we’d expect, given our models, and the reddish regions are where we have more mischief than we’d expect: + +![](../blog-assets/images/2016-09-27-surprise-7.webp) + +What we get is a map that is somewhere in between the previous two maps we saw: Ontario has a lot of mischief, sure, but it has much less than we’d expect, given its outsized population compared to the rest of Canada (around 500 incidents per 100,000 people, compared to an average of *3,800* incidents per 100,000 people for Canada as a whole). Nunavut and the Northwest Territories have higher per capita rates, but this variability is within reasonable limits given their tiny populations. It is the prairie provinces where we see somewhat unexpectedly high levels of mischief. With the right models to back them up, Surprise Maps suppress noisy and irrelevant patterns in maps, and visually highlight what’s left. + +### Calculating Surprise + +Surprise Maps are driven by a statistic called [**Bayesian Surprise**](http://ilab.usc.edu/surprise/). Bayesian Surprise is a measurement initially developed by vision researchers to help identify the most salient (interesting) parts of an image or video. The key idea is that it is not the data by itself that drives interest, but how the data shifts our **models of expectation**. *The same data can produce different levels of surprise, based on our underlying beliefs about that data.* + +Here’s an example. Suppose you are a student taking a class, and you have two different potential expectations: + +1. I am going to pass this class. + +2. I am going to fail this class. + +Suppose you have been a good student so far, and so your prior belief that you will pass the class (1) is high, and your belief that you will fail the class (2) is low. Suppose now that you are handed back a midterm test, and you receive an extremely low grade. This new data is very surprising and it forces you to update your beliefs: you are now much less certain that you will pass the class, and more certain you will fail. Similarly, suppose that you were pretty sure you were going to fail the class before the midterms were handed back (you haven’t been attending lecture, or the material is out of your depth, or some other strong belief). When you receive your failing midterm score, you have certainly received new information, but this information is not very surprising: you already strongly suspected you were not going to do well. Depending on our *prior* beliefs, the same (low) midterm score can cause different levels of surprise. + +Surprise Maps leverage the same intuition: we select an initial set of models and initial beliefs about each of those models. The most surprising information is that which causes the biggest shift in our beliefs about those models: strong evidence for a model we had little belief in, or strong counter-evidence for a model we thought was a sure bet. + +The procedure for generating a Surprise Map is then: + +1. Select a set of potential models for the data. Connected with each of these models is an initial belief (a **Bayesian prior**) about how likely this model is to be true. 
Initially, our models might be equiprobable: we have no strong initial guess as to what we expect to see. + +2. Compare the expected distribution of data to the actual distribution. This allows us to estimate a **likelihood** that we would see our real data, if our model(s) were correct. + +3. Using [**Bayes’ Theorem**](https://en.wikipedia.org/wiki/Bayes%27_theorem), calculate the **posterior** probability of each model. That is, how accurate is our belief in our model given the real data we just observed? (For example, that we are passing the class, or that Manitoba will have the same rate of mischief as Alberta.) + +4. Calculate surprise as a difference between our **prior** and **posterior** probabilities, across all models. High surprise occurs when beliefs shift rapidly; low surprise occurs where there is not much change (we already knew we were failing, and this new ‘F’ grade doesn’t change our minds). + +5. Visualize the surprise values. One can plot either total surprise or signed surprise (where we see if our surprise is caused by over- or under-estimation of the data). Negative surprise is where we see lower quantities than we were expecting, positive is where we see higher quantities than we expected. + +For Surprise Maps, we do not need to choose particularly complex models, so long as the deviation from these models is informative. For instance, the assumption that each Canadian province will have the same rate of mischief (model 1, above) is somewhat naïve: different provinces have different levels of urbanization and poverty and other confounding variables that may impact crime rates. Yet, this initial model is both easy to describe, and has meaningful definitions of counterexamples or deviations. Surprise Maps are intended to work with a small set of simple models, where deviation is meaningful to the analyst. These coarse models function as initial rough guesses as to how the data might appear. + +## Conclusion + +By visualizing surprise, rather than just the data, we make the informed choice to highlight the unexpected, at the expense of the normal. This might not be appropriate for all tasks. For instance, if I really want to know the exact rate of mischief in Ontario, the Surprise Map won’t give me that information. The more traditional choropleth map would be the better choice. Surprise Maps are intended for situations where there are biases and confounding variables at play, and the raw numbers may mislead the viewer. These situations happen quite often in visual analysis: spurious patterns, noise masquerading as signal, and insufficiently strong evidence are all problems that arise when we visualize geographic data. They are also appropriate in situations where we have strong prior beliefs, or strong confounding variables, that we wish to account for. We’ve applied Surprise Maps to bird death data (where seasonal patterns of mortality can drown out interesting signals), natural disasters, and even voting patterns. Surprise Maps allow us to apply techniques of statistical modeling, but in the context of a simple color-coded map. + +For more information about Surprise Maps, read our paper, “[Surprise! Bayesian Weighting for De-Biasing Thematic Maps](http://idl.cs.washington.edu/papers/surprise-maps/),” to be presented at the [IEEE VIS](http://ieeevis.org/) conference in October 2016. 
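+
+For readers who want to see the five-step procedure from the “Calculating Surprise” section in code form, here is a minimal sketch of the per-region computation. The `Model` interface, likelihood functions, and region data are hypothetical; this illustrates the computation rather than reproducing the exact code behind the maps in this post.
+
+```ts
+// Illustrative only: Bayesian surprise for one region, given a small set of models.
+type Region = { name: string; perCapitaRate: number; population: number };
+type Model = { name: string; likelihood: (region: Region) => number };
+
+// Surprise = KL divergence between the posterior and the prior over models.
+function surprise(region: Region, models: Model[], prior: number[]): number {
+  const likelihoods = models.map((m) => m.likelihood(region));
+  const evidence = likelihoods.reduce((sum, l, i) => sum + l * prior[i], 0);
+  const posterior = likelihoods.map((l, i) => (l * prior[i]) / evidence);
+  // High surprise when the data shift belief sharply away from the prior.
+  return posterior.reduce(
+    (kl, p, i) => (p > 0 ? kl + p * Math.log(p / prior[i]) : kl),
+    0
+  );
+}
+```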
+ +*This post was authored by Michael Correll, in collaboration with Jeffrey Heer.* \ No newline at end of file diff --git a/static/blog-assets/posts/2017-05-02-regression-by-eye.md b/static/blog-assets/posts/2017-05-02-regression-by-eye.md new file mode 100644 index 0000000..4be8aeb --- /dev/null +++ b/static/blog-assets/posts/2017-05-02-regression-by-eye.md @@ -0,0 +1,85 @@ +--- +date: 2017-05-02 +title: "Regression by Eye" +paper: regression-by-eye +--- + +William Playfair was an early pioneer of information visualization. Here is one of his charts, a 1786 depiction of the national debt of England: + +![](../blog-assets/images/2017-05-02-regression-by-eye-1.webp) +*[William Playfair, The Commercial and Political Atlas, 1786.](https://commons.wikimedia.org/wiki/William_Playfair#/media/File:1786_Playfair_-_20_Chart_of_the_National_Debt_of_England_(from_3e_edition,_1801).jpg)* + +The version I’ve posted here is too small to make out many of the finer details. Yet, I’d argue that this chart still communicates a clear message, even without those details: the national debt was at the time getting larger, and this trend was accelerating. Small reversals, such as the debt decreasing from 1762–1775, are not sufficient to counteract the overall impression of a strong, increasing trend. We frequently look at charts like these, and make *visual judgments* about *statistical trends*. + +We could have fit an exponential model to this, and visualized the trend line alongside the data, but that might be overkill here. We could just look at the chart, and get a general idea of the trend. In fact, many of the things that make the statistical problem of estimating trends and creating models difficult, like overfitting, selecting a good model space, or detecting and dealing with outliers, can be assisted or even circumvented by our visual estimates. Visual judgments often act as sanity checks that the statistical assumptions we made in our models hold. + +However, successfully communicating messages like the increasing national debt requires that our audience reliably estimate trends in charts, and that we can count on people to see what we expect them to see. There is a resulting tension between statistical estimates of trend, which may have many parameters and be complex to communicate to the general audience, and visual estimates of trend, which may have errors and bias. + +Our [CHI 2017](https://chi2017.acm.org/) paper [“Regression by Eye: Estimating Trends in Bivariate Visualizations”](http://idl.cs.washington.edu/papers/regression-by-eye) investigates two questions: 1) How well do people visually estimate trends in data? and 2) Does the way we visually encode the data bias these estimates? + +As an example, here’s a scatterplot relating the weather outside to the sales of a (fictional) brand of ice cream: + +![](../blog-assets/images/2017-05-02-regression-by-eye-2.webp) + +From the chart, it seems as though ice cream sales are **strongly and positively correlated** with the temperature outside (although let’s remember that [correlation is not the same as causation](http://www.tylervigen.com/spurious-correlations)!. + +Here’s another scatterplot, showing the connection between the murder rate in all 50 US states with respect to the percentage of the population that reports owning guns: + +![](../blog-assets/images/2017-05-02-regression-by-eye-3.webp) + +Here, it seems there is a **very weak correlation** between these two variables: having gun ownership doesn’t seem very predictive of the murder rate. 
At the very least, the link between guns and murder seems more complex than the link between sunny days and ice cream. + +The question arises: how did you do that? It’s unlikely that you pulled out a calculator and calculated the lines of best fit manually. It’s possible you didn’t consider factors like [Pearson’s *r*](https://en.wikipedia.org/wiki/Pearson_correlation_coefficient), or some other statistical measure of correlation. Rather, you likely made a *visual estimate* of how the points were related. While, [with training](http://guessthecorrelation.com/), we can learn to numerically estimate statistical correlation, we seem to have an ability to make general assessments on how variables are related even without this training. We make what is, at heart, a *statistical* judgment, but we do so *visually*. [Recent](http://www.cs.tufts.edu/~remco/publications/2014/InfoVis2014-JND.pdf) [work](https://idl.cs.washington.edu/files/2015-BeyondWebersLaw-InfoVis.pdf) in visualization research investigates how people visually estimate correlation in data, and finds that the visual encoding of data can have a large impact on our ability to accurately make these estimates. + +Our interest is not just in the specific estimation of the degree of correlation, but our ability to visually estimate trends in general. We refer to this overarching ability as **“regression by eye”** — the visual, rather than statistical, estimation of trend and correlation. + +The ability to estimate trends in data sets is important, and is frequently an important part of the persuasive content of a graph or chart. Take, for example, the following bar chart of U.S. unemployment: + +![](../blog-assets/images/2017-05-02-regression-by-eye-4.webp) + +A person might use this chart to show that an economic policy is working because unemployment is *trending downwards*, or alternatively to criticize economic policy because it’s not trending downwards *fast enough*, among other possible rhetorical goals. The fact that, say, unemployment was 4.7% on January 1st, 2016, is mostly relevant only in the context of a larger temporal trend. A person seeking to persuade with this chart is relying on viewers being able to perform regression by eye, and reach conclusions that are broadly similar across the intended audience. + +If regression by eye is an important part of the message that we take away from charts, it is helpful to know to what degree our visual estimates **align** with statistical regression: that is, do our estimates match up with what trends we’d see if we really did bring out the calculators? Our eyes and the statistics may disagree! When that happens, how are our judgments **biased** away from traditional lines (or curves) of best fit? + +A standard tool for calculating trend lines in statistics is [linear least squares regression](https://en.wikipedia.org/wiki/Linear_least_squares_(mathematics)). Linear least squares works best on data with one central cluster of points and few outliers. We gave a group of people, many without any statistical training, this kind of data, and a big purple line. The participants could then adjust the purple line with a slider until it matched their best guess at the trend. We compared their guess to the least squares fit as a way of measuring alignment. We also included, for comparison, non-linear fits such as sine waves and parabolas. 
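+
+As a point of reference, the statistical standard we compared against for linear trends is ordinary least squares. Here is a minimal sketch of the textbook closed-form fit (illustrative data and names, not the study code):
+
+```ts
+// Ordinary least squares fit: y ≈ slope * x + intercept.
+function leastSquares(x: number[], y: number[]): { slope: number; intercept: number } {
+  const n = x.length;
+  const meanX = x.reduce((a, b) => a + b, 0) / n;
+  const meanY = y.reduce((a, b) => a + b, 0) / n;
+  let sxy = 0;
+  let sxx = 0;
+  for (let i = 0; i < n; i++) {
+    sxy += (x[i] - meanX) * (y[i] - meanY);
+    sxx += (x[i] - meanX) ** 2;
+  }
+  const slope = sxy / sxx;
+  return { slope, intercept: meanY - slope * meanX };
+}
+```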
+
+![](../blog-assets/images/2017-05-02-regression-by-eye-5.webp)
+*The linear (left) trends we examined in our experiments, as well as non-linear quadratic (center) and trigonometric (right) trends. All three types of regression assume a central cluster of points.*
+
+In general, people were close to these standard regression fits. When estimating the slope of a linear fit (in the range [0,1]), the average error was 0.06. For estimating the y-intercept (also in the range [0,1]), the average error was 0.04. Just “eyeballing” the trend was a successful approach for getting estimates pretty close to the statistical standard. What’s more, these estimates were generally *unbiased*: people did not systematically over- or under-estimate the trend, but erred on both sides of the line. We found equal accuracy both with linear fits like the one below and with similar estimates in our non-linear models.
+
+![](../blog-assets/images/2017-05-02-regression-by-eye-6.webp)
+*The purple line shows the linear line of best fit. The red area shows the average error in estimating the slope of this line.*
+
+However, we expect that other design choices might impact these estimates. We therefore tested situations where we expected to encounter systematic bias — not just errors, but systematic over- or under-estimation in trend.
+
+### Within-the-bar Bias
+
+![](../blog-assets/images/2017-05-02-regression-by-eye-7.webp)
+*Because colored bars in bar charts seem to “contain” the values that lie within them, there is a consistent bias where outcomes inside the visual area of the bar (left) are perceived as likelier than values outside of the bar (right).*
+
+[Newman and Scholl](http://perception.yale.edu/papers/12-Newman-Scholl-PBR.pdf) identified a curious property of charts that rely on solid shapes and symbols, like bar charts. These charts create a false visual metaphor of containment. In a bar chart, elements may be perceived as either within a bar or outside a bar. This difference leads to a small but reliable bias in how a general audience interprets charts: *values within the bar are perceived as more likely than values outside of the bar.*
+
+![](../blog-assets/images/2017-05-02-regression-by-eye-8.webp)
+*The purple line shows the line of best fit. The red area shows the average bias in estimating the intercept of this line, in area charts. There is a small but significant underestimation in area charts, but not in line charts or scatterplots.*
+
+We found that this bias extends to regression by eye. Area charts are line charts where the area under the line is filled in with a solid color. We found that people habitually underestimate the y-intercept of regression lines in area charts. In other chart types like scatterplots and line charts, where we don’t have filled-in areas (and thus remove the metaphor of containment), this bias disappears.
+
+### Outliers and Regression by Eye
+
+![](../blog-assets/images/2017-05-02-regression-by-eye-9.webp)
+*The upper purple line shows the “robust” line of best fit — the line of best fit if we ignore the outliers. The lower line shows the “sensitive” line of best fit — the line of best fit if we take the outliers into consideration. The average guess is in red. People were hesitant to consider outliers, but still gave them more weight than the robust line.*
+
+Of course, linear least squares regression is not always the best choice for fitting trends to data. In particular, it is very sensitive to outliers.
When there are outliers in a dataset, the statistician must make a choice regarding how extreme values should be allowed to affect the trend. In some cases, outliers may represent bad data, or extreme but unlikely circumstances: if we are interested in the general trend, it might be wise to discount these values. On the other hand, outliers may represent a sudden change in the status quo, or a shifting of the trend, in which case they should be considered just as much as any other point in the data. Statisticians must determine whether to fit a robust trend line (where outliers are excluded or down-weighted), or a sensitive trend (where outliers are fully considered). (A sketch of one common robust estimator appears at the end of this post.)
+
+We tested how regression by eye was affected by the presence of outliers. In general, people select fits that are between the robust and sensitive trends: that is, they weigh outliers significantly less than the sensitive trend line, but still consider them more than a robust (outlier-ignoring) trend line. This is a *misalignment* between statistical models and regression by eye, but it’s difficult to call this an *incorrect* judgment: what to do about outliers (or even what counts as an outlier) is a difficult problem, and these visual judgments on how to weigh extreme points may be useful for deciding between different methods of statistical regression.
+
+### Conclusion
+
+Many times, when we look at charts, we act as *visual statisticians*, estimating complex statistical phenomena like effect size, trend, and confidence. When we design charts, especially ones where we are intending to persuade people about these kinds of statistical information, we should be mindful of the capabilities of the viewer to infer the kind of statistics we care about. Designers should make sure that they are doing their best to communicate their message in a way that their audience can understand. Similarly, as viewers of visualizations, we should be aware that small changes in the visual design of charts can nudge our judgments: we should be wary of these subtle ways in which charts can influence our perception of the data.
+
+Our study found that, for data consistent with the assumptions of standard regression models, our *visual* estimates about trends are in line with the *statistical* estimates. In these cases, we might trust viewers to make consistent and reliable judgments about trends. However, when there are outliers in the data, people do consider these outliers when making their visual judgments: they are more conservative in their estimates of trend than outlier-sensitive statistical regression. That is, we seem to weight outliers less than the rest of the data we consider when making our estimates. In these situations, designers of visualizations ought to consider whether or not to include annotations (like trend lines, or explicit markings of outlier points) if they want to communicate specific statistical trends in a reliable way. Lastly, designers should be wary of within-the-bar bias: bar charts, area charts, and other visually asymmetric designs may introduce similarly asymmetric visual judgments.
+
+Our work is only an initial exploration of our capacity to act as visual statisticians: much more work remains to explore the limits of our abilities to visually estimate statistical patterns in charts. We also wish to explore a wider range of potential biases that might impact our judgments: not just judgments of statistical values themselves, but also elements like trust, uncertainty, and persuasive power.
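+
+For concreteness, here is the robust estimator sketch referenced above: the Theil–Sen fit, which takes the median of pairwise slopes and therefore down-weights outliers. It is an illustration of a “robust” trend line in general, not necessarily the robust fit used in our study.
+
+```ts
+// Theil–Sen estimator: slope = median of all pairwise slopes.
+function median(values: number[]): number {
+  const s = [...values].sort((a, b) => a - b);
+  const mid = Math.floor(s.length / 2);
+  return s.length % 2 ? s[mid] : (s[mid - 1] + s[mid]) / 2;
+}
+
+function theilSen(x: number[], y: number[]): { slope: number; intercept: number } {
+  const slopes: number[] = [];
+  for (let i = 0; i < x.length; i++) {
+    for (let j = i + 1; j < x.length; j++) {
+      if (x[j] !== x[i]) slopes.push((y[j] - y[i]) / (x[j] - x[i]));
+    }
+  }
+  const slope = median(slopes);
+  // Intercept: median residual under the chosen slope.
+  const intercept = median(y.map((yi, i) => yi - slope * x[i]));
+  return { slope, intercept };
+}
+```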
+ +*This post was written by Michael Correll, with input from Jeffrey Heer.* diff --git a/static/blog-assets/posts/2017-05-23-graphscape.md b/static/blog-assets/posts/2017-05-23-graphscape.md new file mode 100644 index 0000000..bcd8098 --- /dev/null +++ b/static/blog-assets/posts/2017-05-23-graphscape.md @@ -0,0 +1,10 @@ +--- +date: 2017-05-23 +title: "GraphScape: Modeling Similarity & Sequence among Charts" +external: "https://medium.com/hci-design-at-uw/graphscape-modeling-similarity-sequence-among-charts-bd82cdbe866d" +headliner: "A single chart is often not enough to understand data and to convey a story." +banner: ../blog-assets/images/2017-05-23-graphscape-1.gif +paper: graphscape +--- + +This is an external post. \ No newline at end of file diff --git a/static/blog-assets/posts/2017-07-11-gap.md b/static/blog-assets/posts/2017-07-11-gap.md new file mode 100644 index 0000000..5cee911 --- /dev/null +++ b/static/blog-assets/posts/2017-07-11-gap.md @@ -0,0 +1,10 @@ +--- +date: 2017-07-11 +title: "Explaining the Gap: Visualizing One’s Predictions Improves Recall and Comprehension of Data" +external: "https://medium.com/hci-design-at-uw/explaining-the-gap-visualizing-ones-predictions-improves-recall-and-comprehension-of-data-ec848d5861d9" +headliner: "What if Visualizations Asked Users to Predict the Data First?" +banner: ../blog-assets/images/2017-07-11-gap-1.webp +paper: explaining-the-gap +--- + +This is an external post. \ No newline at end of file diff --git a/static/blog-assets/posts/2017-10-31-vegalite2.md b/static/blog-assets/posts/2017-10-31-vegalite2.md new file mode 100644 index 0000000..3d0b448 --- /dev/null +++ b/static/blog-assets/posts/2017-10-31-vegalite2.md @@ -0,0 +1,59 @@ +--- +date: 2017-10-31 +title: "Introducing Vega-Lite 2.0" +banner: "../blog-assets/images/2017-10-31-vegalite2-banner.webp" +paper: vega-lite +--- + +We are excited to announce the official version 2 release of [Vega-Lite](https://vega.github.io/vega-lite/), a high-level language for *rapidly creating interactive visualizations*. + +Vega-Lite enables concise descriptions of visualizations as a set of encodings that map data fields to the properties of graphical marks. Vega-Lite uses a portable [JSON](https://en.wikipedia.org/wiki/JSON) format that compiles to full specifications in the larger [Vega language](https://vega.github.io/vega/). Vega-Lite includes support for data transformations such as aggregation, binning, filtering, and sorting, as well as visual transformations such as stacking and faceting into small multiples. + +In addition to an expressive range of *static* visualizations, Vega-Lite 2.0 adds support for *flexible combinations of charts* and *interactions* such as panning, zooming, interactive filtering, and linked selection. Version 2 introduces three major additions: **view composition**, **interaction**, and **Vega 3 support**. + +### View Composition + +A powerful aspect of modular approaches to visualization is the ability to create sophisticated graphics by composing simple ones. Vega-Lite has four operators to compose charts: **layer**, **facet**, **concat** and **repeat**. With *layer*, you can stack charts on top of each other and Vega-Lite automatically determines how to align the axes and share scales. *Facet* subdivides the data into groups and creates a chart for each partition. *Concat* combines arbitrary charts into dashboard layouts. Finally, *repeat* is a data-driven way to concatenate charts. + +Most importantly, these operators can be combined! 
Authors can create rich multi-view graphics within one declarative specification. The example below compares weather data in New York and Seattle, *layering* data for individual years and averages within a *repeated* template for different measurements.
+
+![](../blog-assets/images/2017-10-31-vegalite2-1.webp)
+*Three charts that show different weather measures throughout the year in Seattle and New York. You can create this chart in Vega-Lite with layering and repeating views. [Try it out!](https://vega.github.io/editor/#/examples/vega-lite/repeat_layer)*
+
+The *layer*, *facet*, *concat*, and *repeat* operators comprise a *view algebra* for creating a wide range of multi-view visualizations. For more details about flexible chart composition, see the [Vega-Lite documentation](https://vega.github.io/vega-lite/docs/composition.html).
+
+### Interaction
+
+Multi-view visualizations provide a valuable way to get an overview of many variables at once. However, to investigate further and see data from multiple perspectives, interaction is critical. Vega-Lite version 2 introduces ways to specify not only visual encodings but also *interaction techniques*, using a concise and composable syntax for selections.
+
+In Vega-Lite 2.0, visualization authors specify the *type* of selection they would like (e.g., a point or interval selection), along with possible transformations. The Vega-Lite compiler then automatically synthesizes appropriate input handling logic to implement the interaction. Once defined, interactive selections can be used to parameterize the visualization, for example to filter data, highlight points, and determine scale ranges to pan or zoom a plot.
+
+The plot below uses an interval selection, which causes the chart to include an interactive brush (shown in grey). The brush selection parameterizes the red guideline, which visualizes the average value within the selected interval.
+
+![](../blog-assets/images/2017-10-31-vegalite2-2.gif)
+*An interactive moving average in Vega-Lite 2. [Try it out!](https://vega.github.io/editor/#/examples/vega-lite/selection_layer_bar_month)*
+
+View composition and interactive selections can be combined. Below, we use an interval selection over a set of histograms showing the distributions of different flight statistics. The selection parameterizes a filter for the yellow bars, creating a coordinated interaction commonly referred to as *brushing & linking* or *cross-filtering*. As a viewer adjusts the selection, they can immediately see how the other distributions change in response.
+
+![](../blog-assets/images/2017-10-31-vegalite2-3.gif)
+*A crossfilter interaction in Vega-Lite 2. [Try it out!](https://vega.github.io/editor/#/examples/vega-lite/interactive_layered_crossfilter)*
+
+More details about Vega-Lite’s interaction primitives can be found in [the documentation](https://vega.github.io/vega-lite/docs/selection.html). For a more formal treatment, see the (best-paper-winning) [InfoVis 2016 research paper on the Vega-Lite language design](https://idl.cs.washington.edu/papers/vega-lite).
+
+
+
+*Vega-Lite OpenVis Conf talk*
+
+### Vega 3 Support
+
+Finally, another major change in Vega-Lite 2.0 is that it now targets the new [Vega 3.0 language](https://vega.github.io/vega/). Among other advances, Vega 3 introduces a more performant reactive dataflow runtime and richer support for interaction, layouts, data transformation, binned scales, and more.
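+
+To give a flavor of the selection syntax described above, here is a minimal Vega-Lite 2 specification, written here as a TypeScript object, in which an interval selection parameterizes the color encoding. It uses the standard cars example dataset and is a sketch rather than one of the linked examples.
+
+```ts
+// Minimal Vega-Lite 2 spec: dragging an interval "brush" recolors the selected points.
+const brushedScatter = {
+  $schema: "https://vega.github.io/schema/vega-lite/v2.json",
+  data: { url: "data/cars.json" },
+  mark: "point",
+  selection: { brush: { type: "interval" } },
+  encoding: {
+    x: { field: "Horsepower", type: "quantitative" },
+    y: { field: "Miles_per_Gallon", type: "quantitative" },
+    color: {
+      // Points inside the brush keep their color; everything else falls back to grey.
+      condition: { selection: "brush", field: "Origin", type: "nominal" },
+      value: "grey"
+    }
+  }
+};
+```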
+ +* * * + +In addition to the developments above, we have attempted to make Vega-Lite more consistent and powerful. As one example, data transforms are now specified using an array notation, giving authors more control over the execution order of filters, formula calculations, binning, and aggregation. + +We are excited to see what others will build with Vega-Lite! We are especially encouraged that [Jupyter Lab](https://github.com/jupyterlab/jupyterlab) (the next generation of [Jupyter Notebooks](http://jupyter.org/)) will ship with Vega and Vega-Lite by default. We are also working with the [Altair](https://altair-viz.github.io/) team to release a new version of the Altair Python API for Vega-Lite. Look for that in the months to come to create interactive, multi-view visualizations directly within the Jupyter data science environment! + +Feeling inspired? Check out the [example gallery](https://vega.github.io/vega-lite/examples/) and [applications that use Vega-Lite](https://vega.github.io/vega-lite/applications.html). Read tutorials and documentation at vega.github.io/vega-lite/. Source code is available on GitHub at https://github.com/vega/vega-lite. For updates, follow the Vega project on [Twitter at @vega_vis](https://twitter.com/vega_vis) and [Bluesky at @vega-vis.bsky.social](https://bsky.app/profile/did:plc:oioe5nvbgcid2djoiwsajkzk). + +*This post was written by [Dominik Moritz](https://twitter.com/domoritz), [Kanit “Ham” Wongsuphasawat](https://twitter.com/kanitw), [Arvind Satyanarayan](https://twitter.com/arvindsatya1), and [Jeffrey Heer](https://twitter.com/jeffrey_heer).* diff --git a/static/blog-assets/posts/2018-04-02-multi-comparison.md b/static/blog-assets/posts/2018-04-02-multi-comparison.md new file mode 100644 index 0000000..024291e --- /dev/null +++ b/static/blog-assets/posts/2018-04-02-multi-comparison.md @@ -0,0 +1,9 @@ +--- +date: 2018-04-02 +title: "Multiple Perspectives on the Multiple Comparisons Problem in Visual Analysis" +external: "https://medium.com/hci-design-at-uw/multiple-perspectives-on-the-multiple-comparisons-problem-in-visual-analysis-df7493818bbd" +headliner: "The more visual comparisons an analyst makes, the more likely they are to find spurious patterns — a version of the Multiple Comparisons Problem (MCP) well known in statistical hypothesis testing." +banner: ../blog-assets/images/2018-04-02-multi-comparison-1.webp +--- + +This is an external post. \ No newline at end of file diff --git a/static/blog-assets/posts/2018-07-19-value-suppressing.md b/static/blog-assets/posts/2018-07-19-value-suppressing.md new file mode 100644 index 0000000..c67db45 --- /dev/null +++ b/static/blog-assets/posts/2018-07-19-value-suppressing.md @@ -0,0 +1,45 @@ +--- +date: 2018-07-19 +title: "Value-Suppressing Uncertainty Palettes" +headliner: "The real world is full of uncertainty, but it can be tough to communicate that uncertainty." +paper: uncertainty-palettes +--- + + + +
+ +![](../blog-assets/images/2018-07-19-value-suppressing-1.webp) + +![](../blog-assets/images/2018-07-19-value-suppressing-1-2.webp) +
+ +*Fig. 1. A bivariate map (left) and a Value-Suppressing Uncertainty Palette (VSUP, right), showing polling data for US states before the 2016 Presidential Election. It’s tempting to interpret narrow leads for a candidate as a sure thing. However, polling data is volatile and there can be a lot of uncertainty. VSUPs intentionally make it harder to distinguish uncertain states, but allow you to make finer-grained distinctions in value when the uncertainty is low.* + +The real world is full of uncertainty, but it can be tough to communicate that uncertainty. This is especially true for data visualization, where the usual practice is to *quantify* uncertainty (turn it into a number somehow) and then *encode* uncertainty by visualizing it. This has to happen at the same time as figuring out how to represent the rest of the data. Uncertainty information inherently makes visualizations more complex: it’s more data to show, and uncertainty quantification can be a complex process that results in numbers that are difficult to interpret. + +To make matters worse, one goal of showing uncertainty is to integrate the uncertainty into your decision-making process. That is, you may want people to be less confident in decisions based on highly uncertain data. If you don’t properly integrate uncertainty information, then you risk making the uncertainty *ignorable*, such that people will ignore the risks and variation in the data, and treat things that should have a lot of uncertainty (the outcomes of elections, the effectiveness of medication or diets, or the expected arrival of transportation) as certainties. You also don’t want to be too hasty: if there’s too much uncertainty, maybe the right decision is to wait until there’s more data, or refrain from making too strong of a prediction. + +![](../blog-assets/images/2018-07-19-value-suppressing-2.webp) +*Fig. 2. A [bivariate map from Joshua Stevens](http://www.joshuastevens.net/cartography/make-a-bivariate-choropleth-map/) that shows both population density and sasquatch sightings in the continental US. The brighter the purple, the more sasquatch sightings. The brighter the green, the more population density. Bright blue counties have both dense population and lots of sasquatch sightings.* + +One strategy to make uncertainty unignorable is to use a “bivariate map.” Bivariate maps encode two types of data in the same visual channel. For example, [Joshua Stevens’ Sasquatch map](http://www.joshuastevens.net/cartography/make-a-bivariate-choropleth-map/) (Fig. 2) assigns a color to a U.S. county based on two variables: its population density, and its number of sasquatch sightings. These maps have been around for a long time (see Fig. 3), but they can be hard to interpret. The visual properties we use to represent sasquatch sightings can be difficult to disentangle from the colors we use to represent population density: we don’t perceive “40% green, 60% purple” very accurately when we look at colors! As such, bivariate maps usually limit themselves to a small set of outputs. There’s only 9 possible colors in the sasquatch map, and only 16 possible texture comparisons in the map in Fig. 3. + +![](../blog-assets/images/2018-07-19-value-suppressing-3.webp) +*Fig. 3. One of the earliest surviving examples of a bivariate color map, an [1874 map of land use from Bavarian statistician Georg Mayr](http://infowetrust.com/scroll/#mayr4). 
The width of the vertical red lines shows the density of horses in the area; the width of the horizontal green lines shows the density of cattle. It’s a bit difficult to interpret, but perhaps you can make out regions where there are lots of both horses and cattle, or vice versa.*
+
+When we make a Value-Suppressing Uncertainty Palette, we decide to spend this limited budget of outputs in the service of integrating data and uncertainty. We give *more* distinct outputs to the bivariate map when the data are very certain, and *fewer* when the data are highly uncertain. VSUPs have an internal “tree quantization” scheme to determine which combination of data value and uncertainty value corresponds to which discrete color. When the data are highly uncertain, there’s only one output color. As certainty increases, this color has two “child colors” that divide the data domain equally, allowing us to distinguish high and low values from each other. As certainty increases again, each of these two children has two children of its own, chopping up the data domain into smaller and smaller regions, and allowing fine-grained distinctions as the level of certainty goes up. To drive this metaphor home, rather than the traditional bivariate square legend shape, we prefer to present VSUP legends as a pyramid or wedge (Fig. 4).
+
+![](../blog-assets/images/2018-07-19-value-suppressing-4.webp)
+*Fig. 4. A traditional bivariate map assigns one unique output (in this case, a color) to every combination of the two variables of interest. VSUPs, by contrast, assume that differences between values become less important as uncertainty increases, and so gradually reduce the number of outputs as data become more uncertain. When there’s too much uncertainty, everything gets mapped to the same color.*
+
+Fig. 1 shows an example of a VSUP designed to show polling data prior to the 2016 U.S. Presidential Election. If the candidates are very far apart in the polls, and the margins of error very low, then it’s responsible to talk about even minute differences in polling: a candidate leading 51% to 49% with very narrow margins of error is probably going to win in that state. As such, most of the colors are devoted to these highly certain values. As the margins of error get bigger, speculating about these small differences becomes less responsible: we devote fewer and fewer colors to them, and states can only be said to lean in one direction or the other. If a candidate is polling within 2 margins of error from their opponent, everything gets mapped to the same “tossup” color. The VSUP *suppresses* uncertain values, discouraging viewers from making predictions about them.
+
+![](../blog-assets/images/2018-07-19-value-suppressing-5.webp)
+*Fig. 5. Standard value map (left) and VSUP (right) showing the percentage of US households using the Supplemental Nutrition Assistance Program. Without uncertainty information, Wyoming seems like a clear outlier with few households needing assistance, whereas the east coast looks like a solid mass of high values. In the VSUP, we see that Wyoming’s low population means that its estimates have high uncertainty, and may not be much different than its neighbors.
Similarly, it is mostly the Rust Belt (with states like New York, Pennsylvania, and Michigan) that has both high rates and high certainty in those estimates.*
+
+VSUPs encourage people to be cautious about their judgments when uncertainty is high, but this is not always the behavior a designer might want to see: for instance, a highly uncertain but highly important “black swan” event might deserve high salience in the display, no matter the uncertainty information. Likewise, VSUPs rely on the designer to choose important levels of uncertainty, and what counts as “too uncertain to distinguish.” This definition might not be fixed, or could change over the course of the analysis session. In that case, the designer might want to allow some interactivity or filtering to reshape the VSUP and support new tasks.
+
+There are more details about VSUPs, including an empirical evaluation of their effectiveness, in our [paper repository](https://github.com/uwdata/papers-vsup/). If you’d like to start making VSUPs for yourself, we’ve got [a module](https://github.com/uwdata/vsup) that plays nice with [D3.js](https://d3js.org/)!
+
+*This article was written by Michael Correll, Dominik Moritz, and Jeffrey Heer, describing a paper we presented at CHI 2018. For more, [read the paper](http://idl.cs.washington.edu/papers/uncertainty-palettes).*
\ No newline at end of file
diff --git a/static/blog-assets/posts/2018-10-16-hops.md b/static/blog-assets/posts/2018-10-16-hops.md
new file mode 100644
index 0000000..e9e78ff
--- /dev/null
+++ b/static/blog-assets/posts/2018-10-16-hops.md
@@ -0,0 +1,124 @@
+---
+date: 2018-10-16
+title: "Hypothetical Outcome Plots (HOPs) Help Users Separate Signal from Noise"
+paper: hops-trends
+---
+
+![](../blog-assets/images/2018-10-16-hops-1.webp)
+
+In daily life, we often find ourselves trying to separate signal from noise. For example, does the monthly jobs report suggest a growth trend, or a steady jobs rate? In a pair of experiments, we found that [hypothetical outcome plots (HOPs)](https://medium.com/hci-design-at-uw/hypothetical-outcomes-plots-experiencing-the-uncertain-b9ea60d7c740) — animated samples of possible outcomes — can help people to make this judgment with greater accuracy.
+
+## How do people use HOPs?
+
+HOPs enable viewers to experience variation in outcomes over time, similar to the way we experience uncertain events in our daily lives. Research finds that “a 3 out of 5 chance of rain” is easier to interpret than “a 60% chance of rain”, suggesting that probabilities are easiest to understand when framed as frequencies of events. HOPs make this frequency framing more visceral by using animation to show frequencies over time. In recent years, researchers and data journalists have used HOPs and other data visualizations with frequency framing to communicate uncertainty in [bus arrival times](https://idl.cs.washington.edu/files/2016-WhenIsMyBus-CHI.pdf), [findings of scientific studies](https://idl.cs.washington.edu/files/2018-ImaginingReplications-InfoVis.pdf), [hurricane locations](https://ieeexplore.ieee.org/document/7563342/), [election models](http://www.nytimes.com/newsgraphics/2014/senate-model/), and [the jobs report](https://www.nytimes.com/2014/05/02/upshot/how-not-to-be-misled-by-the-jobs-report.html).
+ +One example of [HOPs in the New York Times](https://www.nytimes.com/2014/05/02/upshot/how-not-to-be-misled-by-the-jobs-report.html) showed how [sampling error](https://en.wikipedia.org/wiki/Sampling_error) can lead to confusion about economic growth. The article focused on monthly reports of jobs added to the economy. They use HOPs to show how sometimes a growing economy will produce jobs numbers that look flat, and other times a stagnant economy will produce promising jobs numbers. + +![](../blog-assets/images/2018-10-16-hops-2.webp) + +
+ +![](../blog-assets/images/2018-10-16-hops-3.gif) + +![](../blog-assets/images/2018-10-16-hops-4.gif) +
+ +*From How Not to Be Misled by the Jobs Report by Irwin and Quealy at the New York Times.* + +## What do we know about HOPs? + +In [2015, Hullman and colleagues](http://idl.cs.washington.edu/papers/hops/) ran a set of experiments showing that users of HOPs made comparable or better judgments of univariate probabilities than users of error bars and violin plots. For probability judgments about multiple variables (i.e., [common language effect size](http://core.ecu.edu/psyc/wuenschk/docs30/CL.pdf)), users of HOPs were an estimated 35 to 41 percentage points more accurate than users of error bars and violin plots. + +
+ +![](../blog-assets/images/2018-10-16-hops-5.webp) + +![](../blog-assets/images/2018-10-16-hops-6.webp) + +![](../blog-assets/images/2018-10-16-hops-7.gif) +
+ +*Uncertainty visualizations compared by Hullman and colleagues: error bars, violin plots, and hypothetical outcome plots.* + +![](../blog-assets/images/2018-10-16-hops-8.webp) +*Image from an article by [Haberman and Whitney](https://www.semanticscholar.org/paper/Ensemble-Perception-%3A-summarizing-the-scene-and-the-Haberman-Whitney/9bd762eb56f1be53743201c448442a753105f5df).* + +Research on the psychology of visual perception (a.k.a., vision science) suggests that [people are able to quickly, accurately, and automatically perceive sets of visual objects](https://scholar.harvard.edu/files/alvarez/files/alvarez-2011-tics-ensemble.pdf) (i.e., ensembles). In experiments, people were able to judge the average size, location, lifelikeness, or facial expression of a set of visual objects. + +This automatic perceptual averaging occurs whether people see objects in a static view or in an animated sequence. Interestingly, people are able to accurately report the statistical properties of a set of objects without being able to remember the characteristics of individual objects. This leads us to believe that HOPs and other ensemble visualizations are processed automatically (without trying) and subconsciously (without being aware) by the visual system. + +### Why did we run a new study? + +We wanted to know whether HOPs improve users’ ability to make everyday judgments about uncertainty. Specifically, we were interested in users’ ability to infer a trend from samples of noisy data, an important judgment when interpreting common applications of statistics. To test this, we created two experiments inspired by a [NYT article about how to interpret the jobs report](https://www.nytimes.com/2014/05/02/upshot/how-not-to-be-misled-by-the-jobs-report.html). + +## Evaluating the impact of HOPs on trend perception + +### Judging trends in ambiguous data + +Do HOPs enable viewers to identify the trend in noisy data better than they could using other uncertainty visualizations? We showed Amazon Mechanical Turk workers a chart of the number of jobs added to the economy each month of a hypothetical year and asked them whether the jobs report shows a trend of no growth or growth. In order to get a sense of the task, take a look at the figure below. + +![](../blog-assets/images/2018-10-16-hops-9.webp) +*An image of the task that participants completed in our experiments. Participants judge examples of jobs numbers in the chart on the left while using the uncertainty visualizations on the right as a reference. We varied both the example charts and whether static uncertainty visualizations (e.g., error bars or static ensembles) or HOPs were used to display the two trends.* + +We conducted two controlled experiments in which participants had to make this judgment repeatedly for many examples of jobs numbers. To ensure that participants were tested across a range of task difficulty including plenty of judgments at the boundaries of their ability, we used an algorithm called a staircase that presented a more difficult example after every third correct response and an easier example after every incorrect response. 
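+
+For readers curious about the mechanics, here is a minimal sketch of the staircase rule described above: harder after every third consecutive correct response, easier after any incorrect response. The difficulty scale, step size, and starting point are illustrative, not the exact values from our experiments.
+
+```ts
+// Adaptive staircase: difficulty in [0, 1], where higher means a harder (more ambiguous) example.
+class Staircase {
+  private consecutiveCorrect = 0;
+  constructor(private difficulty = 0.5, private step = 0.05) {}
+
+  current(): number {
+    return this.difficulty;
+  }
+
+  // Update after each response: step up after every third correct answer, down after any miss.
+  record(correct: boolean): void {
+    if (correct) {
+      this.consecutiveCorrect += 1;
+      if (this.consecutiveCorrect === 3) {
+        this.difficulty = Math.min(1, this.difficulty + this.step);
+        this.consecutiveCorrect = 0;
+      }
+    } else {
+      this.difficulty = Math.max(0, this.difficulty - this.step);
+      this.consecutiveCorrect = 0;
+    }
+  }
+}
+```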
+ +![](../blog-assets/images/2018-10-16-hops-10.webp) +*Examples of charts judged by participants, grouped by the correct trend and difficulty of classifying the trend.* + +### Uncertainty visualizations tested + +Throughout the experiments, participants referenced a pair of uncertainty visualizations (e.g., the right side of the figure above) showing trends of no growth vs growth with uncertainty due to sampling error. + +In our first experiment, we compared HOPs to error bars, a very common uncertainty visualization. We compared participants’ performance when they used the two uncertainty visualizations below as a reference for the task. These visualizations mirrored the visualizations from the NYT article. + +
+ +![](../blog-assets/images/2018-10-16-hops-11.webp) + +![](../blog-assets/images/2018-10-16-hops-12.gif) +
+
+*[Uncertainty visualizations](https://kalealex.github.io/jobs-report-hops/Additional%20Interfaces/Conditions/) compared in our first experiment.*
+
+In a second experiment, we set out to test whether or not there is something helpful about *animating* possible outcomes across frames rather than aggregating them into a static display. We asked participants to make the same judgment as before, but this time the examples of jobs report numbers were shown in line charts. We compared participants’ performance when using the three uncertainty visualizations below. These uncertainty visualizations show the same lines in one static view without animation, and with animation at two different frame rates (400 ms per frame and 100 ms per frame, respectively).
+
+<div style="display: flex">
+ +![](../blog-assets/images/2018-10-16-hops-13.webp) + +![](../blog-assets/images/2018-10-16-hops-14.gif) + +![](../blog-assets/images/2018-10-16-hops-15.gif) +
+ +*[Uncertainty visualizations](https://kalealex.github.io/jobs-report-hops/Additional%20Interfaces/Conditions/) compared in our second experiment.* + +### Measuring user sensitivity to trends + +Using the ground truth about the trends shown in the examples that participants saw, we labeled each judgment as correct or incorrect. What is the best way to analyze this kind of data? + +A simple method is to compute accuracy across all the judgments a participant made when using a specific uncertainty visualization, and then compare the average accuracy people had for each type of uncertainty visualization. However, this approach does not account for the fact that the perceptual judgments people made varied in difficulty (i.e., some hypothetical jobs numbers were more ambiguous). Rather than their overall accuracy, we want to know how much evidence an observer needs before they can accurately recognize a trend. + +We borrowed an approach from [psychophysics](https://en.wikipedia.org/wiki/Psychophysics) (a methodology used by psychologists to study the relation between stimuli and sensations) and estimated just-noticeable-differences (JNDs) for each participant and uncertainty visualization type. JNDs measure the amount of physical evidence a person needs in order to detect a sensory signal. In our task, JNDs provided a measure of how pronounced the trend in ambiguous data needs to be for a participant to classify it correctly about 75% of the time. + +![](../blog-assets/images/2018-10-16-hops-16.webp) +*Difference between the two squares is the average person’s JND for brightness.* + +To get a sense of what a JND looks like, try to tell which of the two squares to the left is lighter. If it feels difficult, that’s because the difference is close to the average JND for lightness. When a person’s JND is higher, they are less sensitive to the signal in a stimulus and require more evidence to make a correct interpretation. When a person’s JND is lower, they are more sensitive to the signal in a stimulus and require less evidence to make a correct interpretation. + +Vision scientists commonly measure and compare JNDs under different display conditions in order to test the impact of specific conditions on perceptual sensitivity. In our experiments, we used JNDs to compare users’ sensitivity when using different uncertainty visualizations as a reference for the two possible underlying trends in the jobs report. + +## HOPs promote accurate perceptions of uncertainty + +![](../blog-assets/images/2018-10-16-hops-17.webp) +*Examples showing the same amount of evidence in favor of growth and no growth, respectively. When using error bars as a reference, the average user could seldom detect the difference in trends. With HOPs as a reference, the difference is detectable.* + +In our first experiment, participants were able to correctly interpret the underlying trend for more ambiguous examples of jobs numbers when using HOPs than when using error bars (see ambiguous examples to left). Perhaps this is because error bars rely on summary statistics like standard error to represent uncertainty. These statistics are not the most readily interpretable representation for uncertainty, particularly for audiences without statistical training. This interpretation of our findings is consistent with the prior work in data visualization and judgement and decision-making which suggests that people more easily understand frequency framing of outcomes. 
+ +In our second experiment, participants were more consistently correct in their interpretation of ambiguous samples with HOPs at 400 ms per frame than with line ensembles. This suggests that *animating outcomes improves user sensitivity* to the underlying trend in ambiguous data beyond the impact of frequency framing alone. Perhaps this is because animation eliminates the visual clutter in analogous static ensembles. This interpretation is consistent with prior work on [crowding](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3070834/), perceptual limitations on the ability to accurately read dense displays. However, HOPs are also subject to perceptual limitations. Compared to 400 ms HOPs and line ensembles, 100 ms HOPs showed an intermediate effect on participants’ sensitivity to the underlying trend. This suggests that the effectiveness of HOPs is diminished for frame rates faster than the blink of an eye. + +When communicating uncertainty is a priority and animation is possible, visualization designers should consider using HOPs. + +*Authors: [Alex Kale](https://people.cs.uchicago.edu/~kalea/) & [Jessica Hullman](http://users.eecs.northwestern.edu/~jhullman/)* + +*Paper: http://idl.cs.washington.edu/papers/hops-trends* \ No newline at end of file diff --git a/static/blog-assets/posts/2018-10-22-draco.md b/static/blog-assets/posts/2018-10-22-draco.md new file mode 100644 index 0000000..4716b94 --- /dev/null +++ b/static/blog-assets/posts/2018-10-22-draco.md @@ -0,0 +1,108 @@ +--- +date: 2018-10-22 +title: "Draco: Representing, Applying & Learning Visualization Design Guidelines" +paper: draco +--- + +From [academic courses](https://courses.cs.washington.edu/courses/cse512/18sp/) to [online articles](https://blog.datawrapper.de/better-charts/), discussions of visualization often abound with design guidelines: *Don’t use pie charts! Don’t use rainbow color maps! Ensure axes include zero!* When applied in a proper context, such guidelines help prevent misleading visualizations. However, such guidelines are not always well-known, and are themselves subject to debate among practitioners, in part because the “proper context” may not be obvious. + +As a result, design guidelines can be challenging to apply, relying on people’s judgment and expertise. What guidelines does my design adhere to? Are there exceptions to the “rules”? For example, we are often told a chart’s axes should include zero, but what if we are using log scales, or using color encodings — are we breaking the rule if not using zero? An additional difficulty is that current guidelines are a bit like folklore: though often grounded in perceptual studies, guidelines are passed down in classes and writings largely as “folk wisdom” rather than concrete, contestable propositions. Furthermore, guidelines may be unfairly written off as “thou shalt not…” proscriptions of purists, rather than serving as resources that directly assist design. + +The [**Draco**](https://uwdata.github.io/draco/) project aims to make design guidelines concrete, actionable, and testable. By encoding guidelines as logical rules, we can build computational knowledge bases for automatically assessing existing charts and generating new recommended charts. 
Draco can warn you if your chart violates design rules and provide design suggestions, even for ambiguous requests such as “visualize the horsepower of these cars using a bar chart.” By formalizing design guidelines as constraints, visualization researchers and practitioners can add their own design considerations and directly test the results, seeing how different rules and weightings change which visualizations are considered the most preferable. + +In this article, we explain how to encode design guidelines in Draco, how Draco can be adapted quickly to incorporate new rules, and how we use machine learning to learn preference weights among guidelines. More details are available in our [InfoVis 2018 research paper about Draco](https://idl.cs.washington.edu/papers/draco/). + +## How Draco Knows How to Create Visualizations + +The first ingredient of Draco is a formal language to describe visualizations. Here we use [Vega-Lite](https://vega.github.io/vega-lite/), a high-level language for describing a variety of statistical graphics. With Vega-Lite, one can concisely describe a visualization as a set of encodings that map from data fields to the properties of graphical marks (such as *x*, *y*, *size*, or *color* properties). Vega-Lite includes data transformations such as filtering, sorting, binning, and aggregation along with visual operations including stacked layouts and faceting data into [small multiples](https://en.wikipedia.org/wiki/Small_multiple). By combining these basic building blocks, Vega-Lite users can construct an [expressive range of graphics](https://vega.github.io/vega-lite/examples/). + +![](../blog-assets/images/2018-10-22-draco-1.webp) +*Figure 1: An example of a bar chart, its Vega-Lite specification (in Vega-Lite JSON), and its equivalent specification using Draco constraints (in ASP). The specification defines the marktype and encodings, which includes a specification of the fields, data type, and data transformations.* + +Draco encodes these visualization building blocks as *logical facts*. Figure 1 shows how a Vega-Lite bar chart is expressed as logical facts in Draco. These statements formally describe properties of the input data, the specified visual encodings, and potentially even the user’s task. As we want to *automatically* reason about these logical facts, we express them using the notation of [Answer Set Programming (ASP)](https://en.wikipedia.org/wiki/Answer_set_programming), a standard format for constraint solvers. + +The second ingredient of Draco is an encoding of design guidelines, realized as *constraints* over the logical facts. The constraints restrict which facts can appear together and which combinations we might prefer. For example, a constraint might express a preference that if a *bar* mark is used to encode quantitative data, the corresponding axis should include zero. + +Draco has three different categories of constraints. First, there are constraints that restrict the domain of attributes to ensure a valid Vega-Lite program. For example, a visualization cannot use *bar* and *point* for the *mark* type at the same time. (Here we restrict ourselves to single charts without layering or composition.) A visualization can also not use an arbitrary undefined mark type such as *wiggly-surface*; we restrict the domain to valid mark types supported by Vega-Lite such as *bar*, *line*, *area*, and *point*. We have similar restrictions for the channel, field, aggregation, and encoding parameters. 
+ +Below is the ASP code for specifying restrictions to a mark type. Don’t worry too much about the exact syntax. The :- symbol can be read as “it cannot be the case that …”. A comma (,) indicates conjunction (logical and). + +*% possible values of mark type*
+*marktype(point;bar;line;area;text;tick;rect).* + +*% Do not allow the mark to be M when M is not a valid mark type.*
+*:- mark(M), not marktype(M).* + +The second category of constraints limits which logical facts can appear together in a valid visualization specification. As a simple example, we never want two facts of the type mark. We can also define more nuanced rules. For example, we can specify that only continuous encodings can be aggregated, or that a bar chart requires an x or y encoding. Again, as we want to formally reason about the logical facts and constraints, we express them in ASP. Below we restrict the number of mark facts to be exactly one. + +*% Allow exactly one mark type.*
+*{ mark(M) : marktype(M) } = 1.* + +To specify that bar charts must use an *x* or *y* encoding, we write a constraint over the *mark* and *channel* facts. An underscore (_) indicates that we don’t care about the exact value at this location. + +*% Point, tick, and bar require x or y channel.*
+*:- mark(point;tick;bar), not channel(_,x), not channel(_,y).*
+
+In summary, constraints limit what logical facts can appear together. We can use them to ensure that the specified visualizations in Draco are valid encodings that we can render using Vega-Lite.
+
+At this point, we might be tempted to start using these constraints to also describe design guidelines. However, if we treat all design guidelines as *hard* (non-negotiable) constraints, we would exclude a number of visualizations that we might want in special cases. For example, consider what would happen if we absolutely forbade quantitative axis scales that do not include zero: any time series of year-over-year values would always have to start at the year zero, we could never use log scales, and many other valid cases would be excluded. Instead of using constraints that must be satisfied, we can instead use *soft* constraints. Unlike hard constraints, it is acceptable for soft constraints to be violated. We instead apply a *penalty* (or cost) for each soft constraint that is not satisfied, allowing us to express *preferences* (rather than fixed rules) over the space of possible designs.
+
+Here is the ASP code for a general guideline that indicates a preference to include zero within the domain of an axis scale:
+
+*:~ encoding(E), not zero(E). [1]*
+
+Notice some changes to the syntax here. The starting segment switched from *:-* to *:~* and we added the number *[1]* at the end. You can read this soft constraint as “We prefer not to have an encoding E that also does not use zero. If we violate this rule, we incur a cost of 1.” Of course, this guideline is overly general, applying to all encodings; so we can use a more specific rule:
+
+*:~ continuous(E), not zero(E). [1]*
+
+This rule says that we only want to apply the soft constraint to continuous quantitative fields. We then add some hard constraints specifically for log scales and bar or area charts, where not including zero would imply incorrect comparisons.
+
+*% log and zero cannot be used together.*
+*:- log(E), zero(E).* + +*% Bar and area marks require continuous scales to start at zero.*
+*:- mark(bar;area), channel(E,x), orientation(horizontal), not zero(E).*
+*:- mark(bar;area), channel(E,y), orientation(vertical), not zero(E).*
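To get a feel for how hard and soft constraints of this kind interact, the snippet below feeds a toy program in the same spirit to the [Clingo](https://potassco.org/clingo/) solver through its Python API. It is a self-contained illustration of the mechanism only, not Draco’s actual knowledge base or learned weights:

```python
# Toy hard + soft constraints in the spirit of the rules above (not Draco's rule set).
# Requires the `clingo` Python package (pip install clingo).
import clingo

PROGRAM = """
marktype(point;bar;line;area).
{ mark(M) : marktype(M) } = 1.     % hard: exactly one mark type
continuous(horsepower).
{ zero }.                          % the solver may include zero on the axis or not
:- mark(bar), not zero.            % hard: bar charts must include zero
:~ continuous(F), not zero. [1,F]  % soft: prefer zero for each continuous field (cost 1)
"""

ctl = clingo.Control()
ctl.add("base", [], PROGRAM)
ctl.ground([("base", [])])
# Clingo reports successively better models; the last one printed is optimal.
ctl.solve(on_model=lambda m: print(m.cost, m))
```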
+ +While defining all of these constraints may seem cumbersome, the result is a thorough and formal specification of “design guidelines” that can be applied and tested automatically. As we will see, this opens up a number of benefits! + +## Using Draco’s Knowledge Base for Critique and Recommendation + +So far we have discussed how we can specify visualizations and design guidelines as facts and constraints over these facts in Draco. The beauty of using constraints is that we can now use existing constraint solvers to find violations and reason about visualization design. + +For example, we can translate a Vega-Lite visualization specification into a set of Draco facts. We can then use a constraint solver to check the specification against all design rules and notify the user if the chart violates any particular guideline. We can use the soft constraint weights to determine the severity of any violations. This support is similar to spelling and grammar checkers in word processors, allowing us to automatically alert a visualization creator to potential issues that might hamper accurate chart reading. + +![](../blog-assets/images/2018-10-22-draco-2.webp) +*Figure 2: Our implementation of the encoding search process using constraints. Draco compiles a user query (including the dataset, the partial specification, and the task) into a set of rules and combines them with the existing knowledge base to form an ASP program. Draco then calls Clingo to solve the program to obtain the optimal answer set. Finally, Draco translates the answer set into a Vega-Lite specification.* + +Another powerful application of constraints is to help designers create more effective designs in the first place. Instead of asking the user to provide a complete specification, we can use the solver to auto-complete a partial specification, as illustrated in Figure 2. The solver uses the hard constraints to determine the search space of all valid visualization specifications, and then uses the soft constraints to find the more preferred encodings within that space: those that accrue the lowest overall penalty (sum of costs). The results can then be presented as a ranked list of charts for the user to examine. + +In this way, Draco can serve as a “design assistant” that makes suggestions to the user. The user is then free to use or override the results as they see fit. We have used Draco to re-implement the [CompassQL recommender engine](http://idl.cs.washington.edu/papers/compassql) that powers our [Voyager exploratory visualization interfaces](http://idl.cs.washington.edu/papers/voyager2/). Not only does the Draco implementation require less code (declarative constraints rather than imperative JavaScript code!), the use of a modern, optimized solver means that the resulting system is also immensely more scalable. + +## Improving Guideline Preferences via Machine Learning + +Draco’s “knowledge” of visualization design is encoded in the hard and soft constraints. While it is comparatively easy for an expert to specify what visualizations are non-sensible and what design rules exist, trading off between potentially competing design rules is much more challenging. Moreover, in general such trade-offs are far from universal: particular domains or organizations may have different conventions or preferences. + +![](../blog-assets/images/2018-10-22-draco-3.webp) +*Figure 3: Overview of Learning-to-Rank in Draco. 
Given visualization pairs in which one chart is preferable to the other, we learn soft constraint weights that best match the observed pairs.* + +The trade-offs between different design guidelines are determined by the weights of the soft constraints. Changes to the soft constraint weights can result in different recommendations from the Draco system. If using Draco for automated visualization design to support data analysis, we want Draco to produce *effective* visualizations that people are more likely to read quickly and accurately. Rather than fine-tuning the costs for all soft constraints by hand, we can determine weights that produce effective encodings by learning from experimental studies of visual encoding effectiveness. + +In the Draco paper, we demonstrate this approach by learning soft constraint weights from data gathered from [two](https://idl.cs.washington.edu/papers/task-data-effectiveness) [separate](https://arxiv.org/pdf/1709.08546) studies on visualization design. We first build a training dataset of visualization pairs, where the experiment results strongly suggest that one visualization in a pair should be preferred to another. We then apply a learning-to-rank approach using a linear [Support Vector Machine](https://en.wikipedia.org/wiki/Support_vector_machine) (SVM), updating the weights until Draco correctly orders as many visualization pairs as possible (Figure 3). On a set of unseen test data, we achieve 96% ranking accuracy. Despite using a linear learning algorithm, Draco can implicitly express non-linear relationships as the soft constraints can be defined over multiple attributes of a visualization. Please see the [Draco paper](http://idl.cs.washington.edu/papers/draco/) and our [vision of using machine learning for visualization design](https://arxiv.org/pdf/1807.06641.pdf) if you are curious about the details! + +## The Future of Draco + +Though promising, Draco’s visualization language is still admittedly modest, and can only express single charts. We are now working on extending the language to support multi-view graphics and interactive charts. We see our current set of logical facts and constraints as the starting point for an evolving knowledge base that can be refined, extended, and tested by researchers and practitioners. To facilitate this extension, we are first developing tools to browse and experiment with Draco’s knowledge base. We will then go on to build tools to observe and collect user actions in order to continuously learn preference weights and adaptively improve Draco’s suggestions. An important aspect of this work will be to integrate Draco into popular visual analysis tools such as the Python [Altair](https://altair-viz.github.io/) bindings for Vega-Lite. + +![](../blog-assets/images/2018-10-22-draco-4.webp) +*Figure 4: An experimental Draco editor with which users can edit the logical facts that describe a partial query and browse the completions generated by the constraint solver.* + +To get started using Draco, please visit the [Draco project website](https://uwdata.github.io/draco/). From there, you can read our research paper, browse the constraints, inspect our learning-to-rank implementation, and experiment with our [online editor](https://uwdata.github.io/draco-editor/) (Figure 4), where you can try Draco without installing anything. + +We are just at the start of exploring and improving Draco’s capabilities, and welcome your thoughts and contributions! 
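As a small appendix to the machine-learning section above, here is a minimal sketch of the general learning-to-rank recipe it describes: summarize each chart by its soft-constraint violation counts and fit a linear SVM on difference vectors of ranked pairs. The numbers and names below are invented for illustration; this is not Draco’s training pipeline.

```python
# Sketch of learning soft-constraint weights from ranked visualization pairs.
# Each visualization is summarized by how often it violates each soft constraint;
# for a pair (better, worse) we train a linear SVM on the difference vector.
# Illustrative only: the data and features here are made up.
import numpy as np
from sklearn.svm import LinearSVC

# violation counts per soft constraint, one row per visualization (toy numbers)
better = np.array([[0, 1, 0], [1, 0, 0]])   # preferred chart in each pair
worse  = np.array([[1, 1, 0], [1, 0, 2]])   # less preferred chart in each pair

# Rank-SVM trick: classify the sign of the difference between pair members.
X = np.vstack([worse - better, better - worse])
y = np.concatenate([np.ones(len(better)), -np.ones(len(better))])

svm = LinearSVC(fit_intercept=False).fit(X, y)
weights = svm.coef_.ravel()   # larger weight = stronger penalty for that violation
print(weights)

# A chart's total cost is the weighted sum of its violation counts;
# the preferred chart in each pair should come out cheaper.
print(better @ weights, worse @ weights)
```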
+ +*This article was authored by [Dominik Moritz](https://www.domoritz.de/) and [Jeffrey Heer](http://jheer.org/).* + +*Draco Project Website and Software: [https://uwdata.github.io/draco/]* + +*Draco InfoVis 2018 Paper: [http://idl.cs.washington.edu/papers/draco/]* diff --git a/static/blog-assets/posts/2019-08-12-errudite.md b/static/blog-assets/posts/2019-08-12-errudite.md new file mode 100644 index 0000000..10c495e --- /dev/null +++ b/static/blog-assets/posts/2019-08-12-errudite.md @@ -0,0 +1,189 @@ +--- +date: 2019-08-12 +title: "Errudite: Scalable, Reproducible, and Testable Error Analysis" +paper: errudite +--- + +

Error analysis is a compass, and we need it to be accurate.

+ +Error analysis — the attempt to analyze when, how, and why machine-learning models fail — is a crucial part of the development cycle: Researchers use it to suggest directions for future improvement, and practitioners make deployment decisions based on it. **Since error analysis profoundly determines the direction of subsequent actions, we cannot afford it to be biased or incomplete.** + +But how are people doing error analysis today? If you read some quotes from ACL papers (a top conference for NLP, or Natural Language Processing), this is what you see: + +- *“We performed an error analysis on a sample of 100 questions.”* +- *We randomly select 50 incorrect questions and categorize them into 6 classes.* +- *We sample 100 incorrect predictions and try to find common error categories.* +- *…* + +Apparently, the community has converged to this shared method: + +
+“We randomly select 50–100 incorrect questions and roughly label them into N error groups.” +
+ +This seems reasonable; what could go wrong? A lot, it turns out. For example, a 50–100 sample size is too small, frequently covering less than 5% of the total errors. Such small samples are likely unrepresentative of the true error distribution. It could be disastrous if we deploy a model only because our sample happens to underestimate a crucial model deficiency (chatbot assistants making inappropriate replies to certain kinds of queries missing from the error sample, etc.). + +Small sample size is just the first problem with the standard approach. In our ACL 2019 paper, we enumerate several key challenges for NLP error analysis, and raise three principles to advocate for *a more precise and reproducible, scalable, and testable procedure*. We also design *Errudite*, an interactive tool that instantiates these principles and addresses the problems with common ad hoc approaches to error analysis. + +Below, we will walk through one specific case, and show how manual, subjective inspection of a small sample of errors can be ambiguous, biased, and miss the root cause of errors. We will also show how Errudite helps avoid pitfalls. Please [see our paper](http://idl.cs.washington.edu/papers/errudite) for more cases, or [watch this video](https://youtu.be/Dil5i0AYyu8) for a demo! + +![](../blog-assets/images/2019-08-12-errudite-1.webp) +*The Errudite interface, with all the features we introduce next with our running example: (A) model overview; (B) attribute histograms; (c) filtering panel for users to specify queries with our domain-specific language, (D) instance list displaying filtered examples; (E) list of saved groups and (F) rewrite rules.* + +## The Scenario + +Let’s use Errudite to analyze a well known Machine Comprehension (MC) baseline model: [BiDAF](https://allenai.github.io/bi-att-flow/). Given a question and a context paragraph, a MC model is supposed to find a snippet of the context that correctly answers the question. + +![](../blog-assets/images/2019-08-12-errudite-2.webp) + +In this example (from SQuAD, a well-known MC dataset), Murray Gold (bolded, ground truth) created the 2005 theme for Doctor Who. + +As one of the most important testbeds for language understanding, MC error analysis is crucial yet challenging: experts are eager to find the weak spots of the state-of-the-art models, but with both inputs (question and context) and output (answer) to MC being unstructured text, they have limited features to break down and inspect the dataset. BiDAF is common enough in MC that most domain experts are familiar with it. In fact, our example comes from our real conversations with MC experts, in which we asked them to evaluate BiDAF’s strengths and weaknesses. + +In the example above, BiDAF makes a mistake, predicting John Debney (underlined) instead of Murray Gold. *We want to generalize from this one error, and understand our model performance more globally.* + +Our first question seeing this error is: *Why does the model make this mistake?* One hypothesis our domain experts came up with is the **Distractor Hypothesis**: + +
+BiDAF is good at matching questions to named entity types 😄, but is often distracted by other spans with the same entity type 😞. +
+ +More specifically, in our example, when we ask “who”, BiDAF knows we are asking for a person name and gives us one, but it’s not the correct one. + +With this hypothesis at hand, our second question is then, *Does the model make similar mistakes often?* Understanding the prevalence of an error hypothesis requires us to inspect more similar instances. As we’ve mentioned before, unstructured texts have limited features for exploring the dataset, and so people tend to group a subset of “distractors” by manually labeling error samples. + +The problem is, an error group defined in this way is subjective, and experts can easily disagree on what one group name means. Imagine a corner case of the distractor error: the ground truth answer to a “when” question is “during his college year,” whereas our model returns a wrong year, “1996.” Some people consider this to be a “distractor” problem as the question type (*“when”*) and the predicted named entity (*year*) matches perfectly. However, others might categorize it as something else, because the ground truth is not a recognizable named entity. You won’t even realize this difference if you just see the name and text description of the error cause. + +In fact, in our user study we observed such inconsistencies even for simple group definitions (please see our paper for details!): When given identical descriptions of an error type from a prior published analysis and asked to reproduce it, our expert users produced groups that vary in size from 13.8% to 45.2% of all errors — which further illustrates the ambiguity in subjective manual labeling. This leads us to our first principle: + +
+P1: Error hypotheses should be defined precisely with concrete descriptions. +
+ +* * * + +## Principle 1: Be precise. + +![](../blog-assets/images/2019-08-12-errudite-3.webp) + +To overcome manual subjectivity and be more precise, Errudite uses a *Domain-Specific Language (DSL)* to quantify instances. + +![](../blog-assets/images/2019-08-12-errudite-4.webp) + +In short, the DSL applies a list of powerful *extractors*, on *targets* of an instance, with additional supporting *operators*. A simple case is to extract the character length of a question, and require it to be larger than 20. + +Combinations of these three building blocks support the extraction of *attributes* from instances in various ways, so experts can answer the “how prevalent” question by *objectively and systematically* grouping instances with particular patterns (e.g., via filters on attributes). + +We use the following query to define distractor errors: + +![](../blog-assets/images/2019-08-12-errudite-5.webp) + +These lines can be broken down into several semantically meaningful conditions: + +- Line 1, the ground truth is an **ENT**ity, like “PERSON.” +- Line 2–3, there are more tokens matching the ground truth entity type in the whole context than in the ground truth alone. Together with line 1, it will mean *“find instances that have a potential distractor in the context.”* +- Line 4, the prediction entity type matches the ground truth one. Line 1–4 would mean, *“our model finds a correct entity type.”* +- And finally, line 5, the prediction is incorrect, so that lines 1–5 define *a group of instances that are distracted*. + +In contrast to just saying “distractor,” if you share these five lines to other researchers, they can precisely know we’ve excluded the previously described corner case. (To include it, our query might express a match between the question type — who, when, etc. — and the predicted named entity type.) + +* * * + +## Principle 2: Cover all the data. + +![](../blog-assets/images/2019-08-12-errudite-6.webp) + +Applying our filters, 192 instances in the group are cases where BiDAF predicts a wrong span, but this span has the same entity type as the ground truth. Note that in addition to this precision, the DSL also makes error analysis scalable: our query filter for just one error category already exceeds the 50–100 sampling convention, which reduces the sampling error. + +![](../blog-assets/images/2019-08-12-errudite-7.webp) + + +These 192 instances cover almost 6% of all BiDAF errors in our validation set. Looks convincing, right? This distractor hypothesis seems to be pretty solid! Should we now go and try to fix this problem in BiDAF? + +![](../blog-assets/images/2019-08-12-errudite-8.webp) + +If we apply all the partial filters and build all the other groups, we notice a different pattern: BiDAF predicts the exact correct span 68% of the time overall, which rises to 80% when the ground truth is an entity. When other entities with the same type are present in the passage, BiDAF is still 79% accurate (i.e., it is not particularly worse when there are potential distractors), and when it predicts an entity with the correct type, it is quite accurate — 88% of the time! This is much higher than the 68% exact match overall. *This means BiDAF actually performs better when it has distractors and the entity type is matched.* So if you just see the errors and decide this is a very important thing to fix, think twice, or you might miss something even bigger. + +So, the DSL helps cover the entire dataset, including the correct instances. 
The error analysis is more systematic and scalable this way, and can give you different conclusions when compared to looking at a small sample of mistakes only. We formally state our second principle as: + +
+P2: Error prevalence should be assessed over the entire dataset — including the true positive (non-error) examples. +
+ +* * * + +## Principle 3: Test error hypotheses to assess causality. + +![](../blog-assets/images/2019-08-12-errudite-9.webp) + +Now, we’ve defined groups related to distractors. But, the presence of distractors in a wrong prediction *does not necessarily* indicate that distractors were the root cause of the mistake. Turning back to our previous example, it’s easy to assume that it is wrong due to the distractor, but maybe it’s because we need to do multi-sentence reasoning to link “Doctor Who” with “the series”, or perhaps something else. + +This leads to one more problem in the status-quo: *We cannot effectively isolate the true cause of an error.* To dig into root causes, we state a third principle: + +
+P3: Error hypotheses should be explicitly tested. +
+
+In Errudite, we help answer this question, *“Are the 192 instances really wrong because of the distractor?”*, by asking a related what-if question: *“If the predicted distractor was not there, would the model predict correctly?”* We answer this question through counterfactual analysis with rewrite rules.
+
+![](../blog-assets/images/2019-08-12-errudite-10.webp)
+
+Leveraging our Domain-Specific Language, Errudite uses rules to rewrite all instances in a group. We can verify whether distractors are causing mistakes by using a rewrite rule on the *is_distracted* group: We rewrite the *context* by replacing the *model’s predicted distractor string*. We swap in a meaningless placeholder *“#”*, so it won’t be detected as an entity anymore.
+
+Once rewritten, we ask our model to perform prediction again. In our previous example (the first below), with “John Debney” replaced, we now get a different distractor prediction — “Ron Grainer.” It seems that another distractor is still confusing the model!
+
+![](../blog-assets/images/2019-08-12-errudite-11.webp)
+
+As for the rest of the group: *changing to a different incorrect entity* occurs 29% of the time. Another 48% of the time, the predictions were corrected, so indeed the distractors were causing flawed predictions. However, for the remaining 23%, the model predicts the same span as before, except now this contains the meaningless hash token! Some other factors are likely at play: Maybe the predicted sentences heavily overlap with the question, almost forcing the model to do naive token matching rather than searching for entities. This kind of counterfactual analysis helps develop insights not available through grouping alone.
+
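The rewrite-and-repredict loop itself is easy to sketch. The outline below is only an illustration of the idea in plain Python, not Errudite’s API; the `predict` function and the instance fields are placeholder names:

```python
# Illustrative outline of the rewrite-and-repredict loop described above.
# `predict` and the instance fields are placeholders, not Errudite's API.
def rewrite_and_repredict(instances, predict):
    outcomes = {"corrected": 0, "different_error": 0, "same_span": 0}
    for ex in instances:                          # e.g., the is_distracted group
        # Counterfactual rewrite: mask the predicted distractor in the context.
        new_context = ex.context.replace(ex.predicted_span, "#")
        new_prediction = predict(ex.question, new_context)
        if new_prediction == ex.ground_truth:
            outcomes["corrected"] += 1            # the distractor was the root cause
        elif "#" in new_prediction:
            outcomes["same_span"] += 1            # still returns the (now masked) span
        else:
            outcomes["different_error"] += 1      # distracted by something else
    return outcomes
```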
+ +## Precise + Reproducible + Re-applicable + +What do we get out of this running story? Throughout the analysis, we were able to build attributes, groups, and rewrite rules with precise queries. + +![](../blog-assets/images/2019-08-12-errudite-12.webp) + +We applied them to BiDAF, and found that BiDAF is not particularly bad at distractors, and that errors seemingly due to distractors may be wrong for other reasons. Beyond that, with the queries saved, we can easily *share* them, and *re-apply* them to the same or different models or data whenever we want. + +
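To make the idea of a saved, shareable query concrete, here is a rough sketch in plain Python of what such a group definition amounts to. The field and helper names are invented placeholders that mirror the five distractor conditions from Principle 1; they are not Errudite’s actual DSL:

```python
# Sketch of a saved, re-applicable group: a named predicate over
# (instance, prediction) pairs. Field and helper names are placeholders.
def is_distracted(ex, pred):
    return (
        ex.truth_entity_type is not None                                     # 1: ground truth is an entity
        and ex.count_entities(ex.truth_entity_type) > ex.truth_entity_count  # 2-3: a potential distractor exists
        and pred.entity_type == ex.truth_entity_type                         # 4: predicted entity type matches
        and not pred.is_correct                                              # 5: but the prediction is wrong
    )

def apply_group(query, instances, predictions):
    """Re-apply the same saved query to any model's predictions."""
    return [ex for ex, pred in zip(instances, predictions) if query(ex, pred)]

# The same definition can be shared and re-run against any model:
# apply_group(is_distracted, dev_set, bidaf_predictions)
# apply_group(is_distracted, dev_set, newer_model_predictions)
```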
+
+## Bonus: User Interface Functionality
+
+Errudite provides a graphical user interface that not only integrates the analysis process, but also provides additional exploration support such as visualizing data distributions, suggesting potential queries, and presenting the grouping and rewriting results.
+
+![](../blog-assets/images/2019-08-12-errudite-13.webp)
+
+### Suggestion via Programming-by-Demonstration
+
+To make it easier to write DSL queries, we use [programming by demonstration](https://en.wikipedia.org/wiki/Programming_by_demonstration) to generalize extraction patterns. When one interactively selects certain tokens, Errudite provides a list of potentially relevant queries to help find related instances.
+
+![](../blog-assets/images/2019-08-12-errudite-14.webp)
+
+### Attribute distribution
+
+To guide exploration, group creation, and refinement, Errudite supports defining complex attributes (in this figure, the entity type of the ground truth answer), and inspecting their distributions in and out of created groups.
+
+* * *
+
+## A Recap!
+
+![](../blog-assets/images/2019-08-12-errudite-15.webp)
+
+In this work, we characterize deficiencies with current error analysis methods in NLP: they are subjective, they are laborious, they ignore the correct cases, and they do not have tests. Errudite responds to each of these deficiencies in turn, making error analysis more precise, reproducible, scalable, and testable.
+
+### Error Analysis Beyond NLP
+
+The current Errudite implementation (specifically, its DSL) focuses on NLP. However, machine learning systems also impact our lives through non-textual media (think of self-driving cars, power networks, *etc.*). We are convinced that, to help make the right decisions — deploy the right model, pursue the right research direction — our three principles can, and *should*, be applied to more areas than NLP. Similar tools putting these principles into practice can be easily built, so long as they support: (1) building precise instance groups with composable building blocks in a domain-specific language; (2) scaling the analysis to cover all the relevant successes and failures by automatically building large groups with filtering queries, and providing visual summaries for them; and (3) testing error hypotheses using counterfactual analysis by rewriting the instances with rules.
+
+For instance, we can imagine the Computer Vision domain benefiting from a DSL that supports object detection. Various perturbation methods have been “rewriting” images to test model robustness, and we believe that they could be modified for the purpose of understanding why models fail in certain groups.
+
+## Error Analysis for Your Own Tasks
+
+The Errudite software is architected to be extensible to other NLP tasks. If you would like to try it out, please visit the [Errudite repo](https://github.com/uwdata/errudite), and see the tutorials!
+ +*This article was authored by [Tongshuang (Sherry) Wu](https://homes.cs.washington.edu/~wtshuang/), [Marco Tulio Ribeiro](https://homes.cs.washington.edu/~marcotcr/), [Jeffrey Heer](https://homes.cs.washington.edu/~jheer/), and [Dan Weld](https://www.cs.washington.edu/people/faculty/weld).* + +*Errudite Open Source repo: [https://github.com/uwdata/errudite]* + +*Errudite ACL 2019 Paper: [http://idl.cs.washington.edu/files/2019-Errudite-ACL.pdf]* diff --git a/static/blog-index.json b/static/blog-index.json new file mode 100644 index 0000000..a4f577b --- /dev/null +++ b/static/blog-index.json @@ -0,0 +1,198 @@ +[ + { + "meta": { + "date": "2019-08-12", + "display_date": "Aug 12, 2019", + "title": "Errudite: Scalable, Reproducible, and Testable Error Analysis", + "web_name": "2019-08-12-errudite", + "paper": "errudite", + "recent": true + }, + "post": "Error analysis is a compass, and we need it to be accurate.\nError analysis — the attempt to analyze ", + "first_image": "blog-assets/images/2019-08-12-errudite-1.webp" + }, + { + "meta": { + "date": "2018-10-22", + "display_date": "Oct 22, 2018", + "title": "Draco: Representing, Applying & Learning Visualization Design Guidelines", + "web_name": "2018-10-22-draco", + "paper": "draco", + "recent": true + }, + "post": "From academic courses to online articles, discussions of visualization often abound with design guid", + "first_image": "blog-assets/images/2018-10-22-draco-1.webp" + }, + { + "meta": { + "date": "2018-10-16", + "display_date": "Oct 16, 2018", + "title": "Hypothetical Outcome Plots (HOPs) Help Users Separate Signal from Noise", + "web_name": "2018-10-16-hops", + "paper": "hops-trends", + "recent": true + }, + "post": "\nIn daily life, we often find ourselves trying to separate signal from noise. For example, does the ", + "first_image": "blog-assets/images/2018-10-16-hops-1.webp" + }, + { + "meta": { + "date": "2018-07-19", + "display_date": "Jul 19, 2018", + "title": "Value-Suppressing Uncertainty Palettes", + "web_name": "2018-07-19-value-suppressing", + "headliner": "The real world is full of uncertainty, but it can be tough to communicate that uncertainty.", + "paper": "uncertainty-palettes", + "recent": true + }, + "post": "\n\n\n\nFig. 1. 
A bivariate map (left) and a Value-Suppressing Uncertainty Palette (VSUP, right), showin", + "first_image": "blog-assets/images/2018-07-19-value-suppressing-1.webp" + }, + { + "meta": { + "date": "2018-04-02", + "display_date": "Apr 2, 2018", + "title": "Multiple Perspectives on the Multiple Comparisons Problem in Visual Analysis", + "web_name": "2018-04-02-multi-comparison", + "banner": "../blog-assets/images/2018-04-02-multi-comparison-1.webp", + "headliner": "The more visual comparisons an analyst makes, the more likely they are to find spurious patterns — a version of the Multiple Comparisons Problem (MCP) well known in statistical hypothesis testing.", + "external": "https://medium.com/hci-design-at-uw/multiple-perspectives-on-the-multiple-comparisons-problem-in-visual-analysis-df7493818bbd", + "recent": true + }, + "post": "This is an external post.\n", + "first_image": "blog-assets/images/2018-04-02-multi-comparison-1.webp" + }, + { + "meta": { + "date": "2017-10-31", + "display_date": "Oct 31, 2017", + "title": "Introducing Vega-Lite 2.0", + "web_name": "2017-10-31-vegalite2", + "banner": "../blog-assets/images/2017-10-31-vegalite2-banner.webp", + "paper": "vega-lite", + "recent": false + }, + "post": "We are excited to announce the official version 2 release of Vega-Lite, a high-level language for ra", + "first_image": "blog-assets/images/2017-10-31-vegalite2-banner.webp" + }, + { + "meta": { + "date": "2017-07-11", + "display_date": "Jul 11, 2017", + "title": "Explaining the Gap: Visualizing One’s Predictions Improves Recall and Comprehension of Data", + "web_name": "2017-07-11-gap", + "banner": "../blog-assets/images/2017-07-11-gap-1.webp", + "headliner": "What if Visualizations Asked Users to Predict the Data First?", + "external": "https://medium.com/hci-design-at-uw/explaining-the-gap-visualizing-ones-predictions-improves-recall-and-comprehension-of-data-ec848d5861d9", + "paper": "explaining-the-gap", + "recent": false + }, + "post": "This is an external post.\n", + "first_image": "blog-assets/images/2017-07-11-gap-1.webp" + }, + { + "meta": { + "date": "2017-05-23", + "display_date": "May 23, 2017", + "title": "GraphScape: Modeling Similarity & Sequence among Charts", + "web_name": "2017-05-23-graphscape", + "banner": "../blog-assets/images/2017-05-23-graphscape-1.gif", + "headliner": "A single chart is often not enough to understand data and to convey a story.", + "external": "https://medium.com/hci-design-at-uw/graphscape-modeling-similarity-sequence-among-charts-bd82cdbe866d", + "paper": "graphscape", + "recent": false + }, + "post": "This is an external post.\n", + "first_image": "blog-assets/images/2017-05-23-graphscape-1.gif" + }, + { + "meta": { + "date": "2017-05-02", + "display_date": "May 2, 2017", + "title": "Regression by Eye", + "web_name": "2017-05-02-regression-by-eye", + "paper": "regression-by-eye", + "recent": false + }, + "post": "William Playfair was an early pioneer of information visualization. 
Here is one of his charts, a 178", + "first_image": "blog-assets/images/2017-05-02-regression-by-eye-1.webp" + }, + { + "meta": { + "date": "2016-09-27", + "display_date": "Sep 27, 2016", + "title": "Surprise Maps: Showing the Unexpected", + "web_name": "2016-09-27-surprise", + "paper": "surprise-maps", + "recent": false + }, + "post": "\nIn 1977, Jerry Ehman — an astronomer working with the SETI project to seek out alien life — came ac", + "first_image": "blog-assets/images/2016-09-27-surprise-1.webp" + }, + { + "meta": { + "date": "2016-07-21", + "display_date": "Jul 21, 2016", + "title": "Atlas of Me: Personalized Spatial Analogy Maps for Unfamiliar Measurements", + "web_name": "2016-07-21-atlas", + "banner": "../blog-assets/images/2016-07-21-atlas-1.webp", + "headliner": "We created Atlas of Me, a Chrome plugin that generates personalized spatial analogy maps for distances and areas.", + "external": "https://medium.com/hci-design-at-uw/atlas-of-me-personalized-spatial-analogy-maps-for-unfamiliar-measurements-e20566d94b52", + "recent": false + }, + "post": "This is an external post.\n", + "first_image": "blog-assets/images/2016-07-21-atlas-1.webp" + }, + { + "meta": { + "date": "2016-04-06", + "display_date": "Apr 6, 2016", + "title": "Author’s Response to Stephen Few’s critique of Hypothetical Outcome Plots", + "web_name": "2016-04-06-resposne", + "recent": false + }, + "post": "Hypothetical outcome plots (HOPs) are an approach to visualizing uncertainty using a set of discrete", + "first_image": null + }, + { + "meta": { + "date": "2016-02-23", + "display_date": "Feb 23, 2016", + "title": "Introducing Vega-Lite", + "web_name": "2016-02-23-vegalite", + "banner": "../blog-assets/images/2016-02-23-vegalite-banner.webp", + "headliner": "Today we are excited to announce the official 1.0 release of Vega-Lite, a high-level format for rapidly creating visualizations for analysis and presentation.", + "external": "https://medium.com/hci-design-at-uw/introducing-vega-lite-438f9215f09e", + "paper": "vega-lite", + "recent": false + }, + "post": "This is an external post.\n", + "first_image": "blog-assets/images/2016-02-23-vegalite-banner.webp" + }, + { + "meta": { + "date": "2016-01-26", + "display_date": "Jan 26, 2016", + "title": "Hypothetical Outcome Plots: Experiencing the Uncertain", + "web_name": "2016-01-26-hops", + "banner": "../blog-assets/images/2016-01-26-hops.gif", + "headliner": "If you are like most people, including many data analysts, interpreting visualizations of uncertainty feels hard and abstract.", + "external": "https://medium.com/hci-design-at-uw/hypothetical-outcomes-plots-experiencing-the-uncertain-b9ea60d7c740", + "paper": "hops", + "recent": false + }, + "post": "This is an external post.\n", + "first_image": "blog-assets/images/2016-01-26-hops.gif" + }, + { + "meta": { + "date": "2015-12-17", + "display_date": "Dec 17, 2015", + "title": "Next Steps for Data Visualization Research", + "web_name": "2015-12-17-next-steps", + "recent": false + }, + "post": "Given its youth and interdisciplinary nature, research methods and training in the field of data vis", + "first_image": "blog-assets/images/2015-12-17-next-steps-1.webp" + } +] \ No newline at end of file diff --git a/yarn.lock b/yarn.lock index bf772ee..4c3cd8c 100644 --- a/yarn.lock +++ b/yarn.lock @@ -3331,6 +3331,11 @@ yaml@^2.3.4: resolved "https://registry.npmjs.org/yaml/-/yaml-2.4.1.tgz" integrity sha512-pIXzoImaqmfOrL7teGUBt/T7ZDnyeGBWyXQBvOVhLkWLN37GXv8NMLK406UY6dS51JfcQHsmcW5cJ441bHg6Lg== +yaml@^2.7.1: + 
version "2.7.1" + resolved "https://registry.yarnpkg.com/yaml/-/yaml-2.7.1.tgz#44a247d1b88523855679ac7fa7cda6ed7e135cf6" + integrity sha512-10ULxpnOCQXxJvBgxsn9ptjq6uviG/htZKk9veJGhlqn3w/DxQ631zFF+nlQXLwmImeS5amR2dl2U8sg6U9jsQ== + yocto-queue@^0.1.0: version "0.1.0" resolved "https://registry.npmjs.org/yocto-queue/-/yocto-queue-0.1.0.tgz"