g | x | w | all

Bytes	Lang	Time	Link
065	AWK	250916T160647Z	xrs
176	Go	230328T180037Z	bigyihsu
101	q	230328T211031Z	eohara-i
008	Stax	230328T145211Z	emirps
249	C gcc	230303T045404Z	ErikF
103	Lua	230305T125347Z	GulgDev
034	Python 3	230305T124340Z	GulgDev
nan	C GCC	230303T131920Z	Peter
050	jq	230228T195820Z	GammaFun
031	Mathematica Wolfram Language	230303T203503Z	dirvine
046	Factor	230228T183440Z	chunes
007	Vyxal	230303T183050Z	Robert
087	PHP 8.x	230302T221227Z	Ismael M
014	Japt	230228T174935Z	Shaggy
024	Perl p	230228T215752Z	naffetS
021	Raku	230301T220245Z	Sean
083	Java JDK	230228T211117Z	Unmitiga
032	Ruby n	230301T112133Z	Kirill L
031	Charcoal	230301T100130Z	Neil
015	Retina 0.8.2	230228T174039Z	Neil
012	05AB1E	230301T084317Z	Kevin Cr
027	APL Dyalog Extended	230301T044018Z	Programm
120	Excel	230228T214502Z	Jos Wool
054	Python	230228T215429Z	naffetS
007	Vyxal	230228T212749Z	AndrovT
023	J	230228T204509Z	south
035	Arturo	230228T201657Z	chunes
nan		230228T200416Z	The Thon
049	PowerShell Core	230228T193939Z	Julian
036	Zsh	230228T191829Z	GammaFun
046	JavaScript	230228T175920Z	Shaggy
028	Bash + GNU utils	230228T174407Z	Digital

AWK, 65 bytes

BEGIN{FPAT="_*[A-z]+_*"}{for(;i++<NF;)!b[tolower($i)]++&&x++}$0=x

Attempt This Online!

Go, 181 176 bytes

import(."regexp";."strings")
func f(s string)int{m:=make(map[string]int)
for _,w:=range MustCompile("\\w+").FindAllString(ToLower(s),-1){if _,o:=m[w];!o{m[w]=1}}
return len(m)}

Attempt This Online!

Gets all words matching regex \w+, and adds them to a map (acting as a set in this case). Then it returns the number of items in the map (set).

-5 bytes by @The Thonnu

q, 101 bytes

{(#:)(&:)min each not(^:)(?:){$[not(#:)(&:)x in .Q.an;0N;x]}each cut[(&:)0<>(-':)x in .Q.an;lower x]}

More verbose version:

{count where min each not null distinct {$[not count where x in .Q.an;" ";x]}each cut[where 0<>deltas x in .Q.an;lower x]}

Stax, 8 bytes

│ÿîIΔ»╝H

Run and debug it

This is a packed stax program. When unpacked, it is the following:

v"\w+"|Fu%

Run and debug it

Explanation

v          # lowercase the input string
      |F   # get all regex pattern matches of regex
 "\w+"     # \w+
        u  # uniquify
         % # length

C (gcc), 275 254 250 249 bytes

-26 bytes thanks to ceilingcat

To split the words, each uppercased word is stored in a list recursively. Duplicates are nulled out, preventing them from being scanned.

g(s,t,i)char*s,**t;{char*a[2]={0,t},**v,*u;for(;*s&&!isalnum(*s)&&*s-95;s++);u=*a=strdup(s);if(i=*u){for(;*u=*s&&isalnum(*s)|*s==95;*u++=~32&*s++);i=g(s,a);}else for(v=t;v;v=v[1])if(*v)for(i++,t=v;t=t[1];)*t&&!strcmp(*v,*t)?*t=0:0;s=i;}f(s){g(s,0);}

Try it online!

Ungolfed (with a structure instead of an array):

struct list { char *data; struct list *prev; };

// A word character is an ASCII letter, a digit or '_' (the regex class \w).
static int is_word_char(char c) {
  // <ctype.h> classifiers need a value representable as unsigned char
  return isalnum((unsigned char)c) || c == '_';
}

// Count the distinct words in s, ignoring case.
//
// s: NUL-terminated input; it is only read, never modified.
// t: the most recently collected word (each node links to the previous one),
//    or 0 when no word has been collected yet.
//
// Each call extracts one uppercased word into a node living in its own stack
// frame and recurses on the rest of the string; the call that reaches the end
// of the string walks the finished list and counts the unique entries.
//
// Returns the number of distinct words, or -1 if an allocation fails.
int g(char *s, struct list *t) {
  int i = 0;
  size_t n;
  char *word;
  struct list a = {0, t}, *v, *w;

  // skip separators: anything that is neither alphanumeric nor '_'
  for (; *s && !is_word_char(*s); s++);

  if (*s) { // collect the word and uppercase it
    for (n = 0; is_word_char(s[n]); n++); // measure the word
    word = a.data = malloc(n + 1);
    if (!word) return -1;
    for (n = 0; is_word_char(*s); s++)
      word[n++] = toupper((unsigned char)*s);
    word[n] = 0;

    i = g(s, &a); // recursively generate the rest of the list

    // a.data may have been nulled out as a duplicate by the deepest call,
    // so release the buffer through the local pointer instead
    free(word);
  } else { // end of string: process the words
    for (v = t; v; v = v->prev) // from the end, work backwards
      if (v->data) { // if not a duplicate
        i++;
        for (w = v; (w = w->prev); ) // scan for duplicates
          if (w->data && !strcmp(v->data, w->data))
            w->data = 0; // null out duplicates so they are not counted again
      }
  }

  return i; // the count
}

// Entry point: start with an empty list and return the number of distinct words.
int f(char *s) { return g(s, 0); }

Lua, 103 bytes

function x(i)r=0m={}for c in i:lower():gmatch("%w+")do if not m[c]then m[c]=1r=r+1 end end return r end

Try it online!

Python 3, 34 bytes

lambda i:len({*i.lower().split()})

Try it online!

C (GCC), 156 + 48 + 22 = 226 bytes

-3 bytes thanks to ceilingcat

Use compiler flags -DW(a)=for(--s;a!isalnum(*++s)|*s==95;)*s|=32; and -DF(a)=for(a=0;a<i;++a).

char*v[99],*e;i,j,k;f(char*s){e=s+strlen(s);W()for(i=0;s<e;){v[i++]=s;W(!)W((*s=0)|)}F(j)F(k)strcmp(v[j],v[k])|j==k||(*v[j]=0);k=0;F(j)*v[j]&&++k;return k;}

Try It Online!

Explanation:

char*v[99],*e;
i,j,k;
f(char*s)
{
    // Set e to the end of s
    e=s+strlen(s);
    // Set s to the first character that is alphanumerical or an underscore.
    W()
    // While s hasn't moved past the end of the string ...
    for(i=0;s<e;) {
        // Store s in v[i], then increment i
        v[i++]=s;
        // Set s to the first character that isn't alphanumerical or an
        // underscore.
        W(!)
        // Set all characters between the current word and the next to 0, and
        // set s to the first character in the next word.
        W((*s=0)|)
    }
    // Iterate through the words with j.
    F(j)
        // Iterate through the words with k.
        F(k)
            // If words at index j and k are equal but j and k aren't the same
            // index, set the first character in word j to 0, marking the word
            // as a duplicate.
            strcmp(v[j],v[k])|j==k||(*v[j]=0);
    k=0;
    F(j)
        // For each word that hasn't been marked as a duplicate, increment k.
        *v[j]&&++k;
    // Return the number of words not marked as a duplicate.
    return k;
}

jq, 53 52 50 bytes

-1 byte by Neil for recognizing splitting on just \W works since we handle empty strings anyway. -2 bytes by me for realizing I could ascii_downcase first, saving a . and a |

ascii_downcase+":"|[splits("\\W")]|unique|length-1

~~Try it online! Try it online!~~ Try it online!

Equivalent 50 byte answer:

ascii_downcase+":"|split("\\W";"")|unique|length-1

Thanks to chunes' answer for inspiring me to try golfing splits("\\W+") instead of match("\\w+";"g"). Turns out, despite having to work around the empty string being matched in some cases, it is two bytes shorter!

For those curious, here's the match method:

[match("\\w+";"g").string|ascii_downcase]|unique|length

Mathematica (Wolfram Language), 31 bytes

Length@*WordCounts@*ToLowerCase

The Mathematica built-in WordCounts treats a hyphenated word as a single word. So this program is not going to work correctly on the fourth example, "hello-world2". I discuss this in the comment below.

Factor, 47 46 bytes

[ >lower R/ \W/ re-split harvest cardinality ]

Try it online!

-1 byte thanks to GammaFunction

>lower convert input to lowercase
R/ \W/ re-split split on non-word characters
harvest remove empty strings
cardinality length without duplicates

Splitting and harvesting is shorter than simply getting a list of matches because the word for that (all-matching-slices) is super long. Not sure if there is a way to prevent the empty strings in pure regex, might be shorter.

Vyxal, 7 bytes

ɽøWǍUL‹

Try it Online!

Explanation:

ɽ       - Lowercases Input
 øW     - Groups string by words into a list
   Ǎ    - Removes all non-alphabetical items, leaving empty list spaces
    U   - Removes duplicate list items (uniquify)
     L  - Gets length of list
      ‹ - Decrements by 1, to account for extra list item for first symbol

PHP 8.x, 87 bytes

This is quite a long piece of code...

fn($z)=>count(array_flip(array_filter(array_map('strtolower',preg_split('/\W+/',$z)))))

It's almost self-explanatory.

It's so devoid of PHP-only tricks that it is trivial to re-implement it in JavaScript!

// PHP-like code
// Same pipeline as the PHP answer: split the input on runs of non-word
// characters, lowercase every piece, drop the empty pieces, de-duplicate,
// and count what is left.
let fn = ($z)=>count(array_flip(array_filter(array_map('strtolower',preg_split('\\W+',$z)))));

// Test boilerplate that has enough functionality
// Each helper below is a minimal JavaScript stand-in for the PHP function of
// the same name; it implements only what the one-liner above needs.

// Returns the number of elements in the array.
function count(value) {
  return value.length;
}

// Returns the string converted to lower case.
function strtolower(str) {
  return str.toLowerCase();
}

// Splits subject on every match of regex, where regex is the pattern source
// as a string (no delimiting slashes).
function preg_split(regex, subject) {
  return subject.split(new RegExp(regex));
}

// Keeps every element except the empty string.
// NOTE(review): PHP's array_filter() without a callback presumably drops all
// falsy entries (e.g. "0"), not only '' - confirm against the PHP manual.
function array_filter(array) {
  return array.filter(function(value){
    return value !== '';
  });
}

// Returns the distinct values of the array, in first-seen order.
// Unlike the PHP function, no value => key mapping is built; only the number
// of distinct values matters to the caller.
function array_flip(array) {
  return Array.from(new Set(array));
}

// Applies fn to every element. A string fn is looked up as a property of
// window (so it must name a global function); anything else is used as the
// callback directly.
function array_map(fn, array) {
  return array.map(typeof fn === 'string' ? window[fn] : fn);
}

// Event handler - does the basic output thing
// `text` and `output` are not declared in this script; presumably they resolve
// to the elements with those ids in the accompanying markup.
text.oninput = function(){
  output.innerText = fn(this.value);
};

<input type="text" id="text"/>

<p>Output: <span id="output">--</span></p>

Differences

There aren't a lot of differences between the PHP and JavaScript versions:

The fn($z)=>[...] has to be written without the fn bit.
The regular expression has to be escaped and can't have the slashes.
This means it changes from /\W+/ to \\W+.
The function array_flip only returns a set of all unique values.
The PHP function returns the array with the keys set from the values.
That is, an array like ['a', 'fox', 'a', 'car'] will be returned as ['a' => 2, 'fox' => 1, 'car' => 3] while JavaScript returns ['a', 'fox', 'car'].
The end result is the same: an array that has the same number of unique elements.

These differences won't affect the accuracy of the results.
But it's worth dealing with them to give you an improved testing environment.

Japt, 14 bytes

Yet another one of those occasions I regret suggesting the removal of _ from the \w RegEx class in Japt!

f"[%w_]+" üv l

Try it

f"[%w_]+" üv l     :Implicit input of string
f                  :Match
 "[%w_]+"          :  RegEx /[a-z0-9_]+/gi
          ü        :Group & sort by
           v       :  Lowercase
             l     :Length

Perl `-p`, 24 bytes

$_=grep!$s{+lc}++,/\w+/g

Attempt This Online!

-1 byte thanks to Kjetil S

Raku, 21 bytes

+*.lc.comb(/\w+/).Set

Try it online!

This is an anonymous function. The argument (*) is converted to lowercase (.lc), then the substrings matching one or more word characters are extracted (.comb(/\w+/)) and converted to a set (.Set), which discards the duplicates. Finally, that set is expressed as a number (+), yielding its size.

Java (JDK), 83 bytes

s->java.util.Arrays.stream((" "+s).toLowerCase().split("\\W")).distinct().count()-1

Try it online!

Saved 17 bytes thanks to Neil.

Ruby `-n`, 32 bytes

p$_.upcase.scan(/\w+/).uniq.size

Attempt This Online!

Charcoal, 31 bytes

≔⦃⦄θＦΦ⪪↧Ｓ⁻⪪γ¹⊞ＯＥ³⁶⍘ιφ_ι§≔θιιＩＬθ

Attempt This Online! Link is to verbose version of code. Explanation: Inspired by @KevinCruijssen's 05AB1E answer, so assumes the input only contains printable ASCII.

≔⦃⦄θ

Start with an empty dictionary.

ＦΦ⪪↧Ｓ⁻⪪γ¹⊞ＯＥ³⁶⍘ιφ_ι

Split the lowercased input on all printable ASCII except the characters used to encode base 36 and _, and...

§≔θιι

... set each word as a key in the dictionary.

ＩＬθ

Output the final length of the dictionary.

Retina 0.8.2, 15 bytes

T`L`l
D`\w+
\w+

Try it online! Link includes test cases. Explanation:

T`L`l

Convert to lower case.

D`\w+

Deduplicate words.

\w+

Count the number of remaining words.

13 bytes in Retina 1:

D$`\w+
$l
\w+

Try it online! Link includes test cases. Explanation:

D$`\w+
$l

Deduplicate words by lowercased value.

\w+

Count the number of remaining words.

05AB1E, 12 bytes

žQžjмS¡õKlÙg

Assumes the input will only contain printable ASCII characters.

Try it online or verify all test cases.

Explanation:

žQ            # Push the constant string with all printable ASCII characters
  žj          # Push the constant string with "a-zA-Z0-9_"
    м         # Remove all those characters
     S        # Convert the string to a list of characters
      ¡       # Split the (implicit) input-string by each of those characters
       õK     # Remove any empty strings from the list
         l    # Convert each word to lowercase
          Ù   # Uniquify the list of lowercase words
           g  # Pop and push the length
              # (which is output implicitly as result)

APL (Dyalog Extended), 27 bytes

f←{⍴∪(⎕A,⎕D,'_')(∊⍨⊆⊢)1⎕C⍵}

Try it online!

f←{⍴∪(⎕A,⎕D,'_')(∊⍨⊆⊢)1⎕C⍵}
                        1⎕C      to uppercase
     (⎕A,⎕D,'_')                 [A-Z0-9_]
     (⎕A,⎕D,'_')(∊⍨⊆⊢)1⎕C       Partition (⊆) using Membership (∊)
    ∪                             remove duplicates
   ⍴                              count words

Excel, 120 bytes

=LET(
    x,MID(UPPER(A1),ROW(A:A),1),
    ROWS(
        UNIQUE(
            TEXTSPLIT(A1,,
                IF((ABS(77.5-CODE(x&"Z"))<13)+1-ISERR(0+x)+(x="_")=0,x),1
            )
        )
    )
)

Python, 54 bytes

lambda s:len({*re.findall('\w+',s.lower())})
import re

Attempt This Online!

Vyxal, 7 bytes

`†`ẎɽUL

Try it Online!

`†`Ẏ    # find all matches of \w+
    ɽ   # to lowercase
     U  # uniquify
      L # length

With a flag:

Vyxal `l`, 6 bytes

`†`ẎɽU

Try it Online!

J, 23 bytes

'\w+'#@~.@rxall tolower

Uniquify and count word matches in lowercase input.

Attempt This Online!

Arturo, 35 bytes

$=>[match lower&{/\w+}|unique|size]

Try it

Thunno, \$ 11 \log_{256}(96) \approx \$ 9.05 bytes

u"\w+"AfZUL

Attempt This Online!

Same approach as basically every other answer.

u"\w+"AfZUL  # Implicit input
u            # Uppercase
 "\w+"Af     # Regex findall "\w+"
        ZU   # Uniquify
          L  # Length

PowerShell Core, 49 bytes

($args-split'\W'-ne''|%{$_|% *l*r}|sort|gu).Count

Try it online!

($args-split'\W'-ne''|%{$_|% *l*r}|sort|gu).Count # full function
 $args-split'\W'-ne''                             # Splits the input string on non-word characters and removes empty entries
                     |%{$_|% *l*r}                # Calls ToLower() on each of the words
                                  |sort|gu        # Get unique words, Get-Unique needs the list to be sorted to remove all duplicates
(                                         ).Count # Return the count

Zsh, 36 bytes

<<<${#${(u)=${1:l}//[^[:IDENT:]]/ }}

Try it online!

Fortunately, the [:IDENT:] character class matches exactly the characters that make up the words we should keep.

<<<${#${(u)=${1:l}//[^[:IDENT:]]/ }}
            ${1:l}                    # lowercase string
      ${          //[^[:IDENT:]]/ }   # // replace non-[:IDENT:] with spaces
      ${   =                      }   # = split on $IFS (space/tab/newline)
      ${(u)                       }   # keep first occurrence of each word
   ${#                             }  # count
<<<                                   # print

JavaScript, 46 bytes

s=>new Set(s.toLowerCase().match(/\w+/g)).size

Try it online!

Bash + GNU utils, 28

grep -Eo \\w+|sort -fu|wc -l

Try it online!

AWK, 65 bytes

Go, 181 176 bytes

q, 101 bytes

Stax, 8 bytes

Explanation

C (gcc), 275 254 250 249 bytes

Lua, 103 bytes

Python 3, 34 bytes

C (GCC), 156 + 48 + 22 = 226 bytes

jq, 53 52 50 bytes

Mathematica (Wolfram Language), 31 bytes

Factor, 47 46 bytes

Vyxal, 7 bytes

PHP 8.x, 87 bytes

Differences

Japt, 14 bytes

Perl -p, 24 bytes

Raku, 21 bytes

Java (JDK), 83 bytes

Ruby -n, 32 bytes

Charcoal, 31 bytes

Retina 0.8.2, 15 bytes

05AB1E, 12 bytes

APL (Dyalog Extended), 27 bytes

Excel, 120 bytes

Python, 54 bytes

Vyxal, 7 bytes

With a flag:

Vyxal l, 6 bytes

J, 23 bytes

Arturo, 35 bytes

Thunno, \$ 11 \log_{256}(96) \approx \$ 9.05 bytes

PowerShell Core, 49 bytes

Zsh, 36 bytes

JavaScript, 46 bytes

Bash + GNU utils, 28

Perl `-p`, 24 bytes

Ruby `-n`, 32 bytes

Vyxal `l`, 6 bytes