g | x | w | all
Bytes Lang Time Link
065AWK250916T160647Zxrs
176Go230328T180037Zbigyihsu
101q230328T211031Zeohara-i
008Stax230328T145211Zemirps
249C gcc230303T045404ZErikF
103Lua230305T125347ZGulgDev
034Python 3230305T124340ZGulgDev
nanC GCC230303T131920ZPeter
050jq230228T195820ZGammaFun
031Mathematica Wolfram Language230303T203503Zdirvine
046Factor230228T183440Zchunes
007Vyxal230303T183050ZRobert
087PHP 8.x230302T221227ZIsmael M
014Japt230228T174935ZShaggy
024Perl p230228T215752ZnaffetS
021Raku230301T220245ZSean
083Java JDK230228T211117ZUnmitiga
032Ruby n230301T112133ZKirill L
031Charcoal230301T100130ZNeil
015Retina 0.8.2230228T174039ZNeil
01205AB1E230301T084317ZKevin Cr
027APL Dyalog Extended230301T044018ZProgramm
120Excel230228T214502ZJos Wool
054Python230228T215429ZnaffetS
007Vyxal230228T212749ZAndrovT
023J230228T204509Zsouth
035Arturo230228T201657Zchunes
nan230228T200416ZThe Thon
049PowerShell Core230228T193939ZJulian
036Zsh230228T191829ZGammaFun
046JavaScript230228T175920ZShaggy
028Bash + GNU utils230228T174407ZDigital

AWK, 65 bytes

BEGIN{FPAT="_*[A-z]+_*"}{for(;i++<NF;)!b[tolower($i)]++&&x++}$0=x

Attempt This Online!

Go, 181 176 bytes

import(."regexp";."strings")
func f(s string)int{m:=make(map[string]int)
for _,w:=range MustCompile("\\w+").FindAllString(ToLower(s),-1){if _,o:=m[w];!o{m[w]=1}}
return len(m)}

Attempt This Online!

Gets all words matching regex \w+, and adds them to a map (acting as a set in this case). Then it returns the number of items in the map (set).

q, 101 bytes

{(#:)(&:)min each not(^:)(?:){$[not(#:)(&:)x in .Q.an;0N;x]}each cut[(&:)0<>(-':)x in .Q.an;lower x]}

More verbose version:

{count where min each not null distinct {$[not count where x in .Q.an;" ";x]}each cut[where 0<>deltas x in .Q.an;lower x]}

Stax, 8 bytes

│ÿîIΔ»╝H

Run and debug it

This is a packed stax program. When unpacked, it is the following:

v"\w+"|Fu%

Run and debug it

Explanation

v          # lowercase the input string
      |F   # get all regex pattern matches of regex
 "\w+"     # \w+
        u  # uniquify
         % # length

C (gcc), 275 254 250 249 bytes

To split the words, each uppercased word is stored in a list recursively. Duplicates are nulled out, preventing them from being scanned.

g(s,t,i)char*s,**t;{char*a[2]={0,t},**v,*u;for(;*s&&!isalnum(*s)&&*s-95;s++);u=*a=strdup(s);if(i=*u){for(;*u=*s&&isalnum(*s)|*s==95;*u++=~32&*s++);i=g(s,a);}else for(v=t;v;v=v[1])if(*v)for(i++,t=v;t=t[1];)*t&&!strcmp(*v,*t)?*t=0:0;s=i;}f(s){g(s,0);}

Try it online!

Ungolfed (with a structure instead of an array):

struct list { char *data; struct list *prev; };

int g(char *s, struct list *t) {
  int i;
  char *u;
  struct list a={0,t}, *v;

  for(;*s&&!(isalnum(*s)&&*s-'_');s++);
  u=a.data=strdup(s); // skip spaces and duplicate the string locally

  if(i=*u){ // collect the word and uppercase it
    for(;*u=*s&&isalnum(*s)|*s=='_';*u++=~32&*s++);
    i=g(s,&a); // recursively generate list
  }else // end of string: process the words
    for(v=t;v;v=v->prev) // from the end, work backwards
      if(v->data) // if not a duplicate
        for(i++,t=v;t=t->prev;) // scan for duplicates
          t->data&&!strcmp(v->data,t->data)?t->data=0:0; // null out duplicates

  s=i; // return the count
}

int f(char *s) { g(s,0); } // initialize the end of list and (implicitly) return the count

Lua, 103 bytes

function x(i)r=0m={}for c in i:lower():gmatch("%w+")do if not m[c]then m[c]=1r=r+1 end end return r end

Try it online!

Python 3, 34 bytes

lambda i:len({*i.lower().split()})

Try it online!

C (GCC), 156 + 48 + 22 = 226 bytes

-3 bytes thanks to ceilingcat

Use compiler flags -DW(a)=for(--s;a!isalnum(*++s)|*s==95;)*s|=32; and -DF(a)=for(a=0;a<i;++a).

char*v[99],*e;i,j,k;f(char*s){e=s+strlen(s);W()for(i=0;s<e;){v[i++]=s;W(!)W((*s=0)|)}F(j)F(k)strcmp(v[j],v[k])|j==k||(*v[j]=0);k=0;F(j)*v[j]&&++k;return k;}

Try It Online!

Explanation:

// Expanded layout of f() for reading. W and F are not defined in this listing:
// they are macros supplied through the compiler flags,
//   W(a) = for(--s;a!isalnum(*++s)|*s==95;)*s|=32;
//   F(a) = for(a=0;a<i;++a)
// so this only compiles when built with those -D options.
// v holds at most 99 word-start pointers, and the input buffer is modified in
// place (characters are OR-ed with 32 and separators are overwritten with 0).
char*v[99],*e;
i,j,k;
f(char*s)
{
    // Set e to the end of s
    e=s+strlen(s);
    // Set s to the first character that is alphanumerical or an underscore.
    W()
    // While s hasn't moved past the end of the string ...
    for(i=0;s<e;) {
        // Store s in v[i], then increment i
        v[i++]=s;
        // Set s to the first character that isn't alphanumerical or an
        // underscore.
        W(!)
        // Set all characters between the current word and the next to 0, and
        // set s to the first character in the next word.
        W((*s=0)|)
    }
    // Iterate through the words with j.
    F(j)
        // Iterate through the words with k.
        F(k)
            // If words at index j and k are equal but j and k aren't the same
            // index, set the first character in word j to 0, marking the word
            // as a duplicate.
            strcmp(v[j],v[k])|j==k||(*v[j]=0);
    k=0;
    F(j)
        // For each word that hasn't been marked as a duplicate, increment k.
        *v[j]&&++k;
    // Return the number of words not marked as a duplicate.
    return k;
}

jq, 53 52 50 bytes

-1 byte by Neil for recognizing splitting on just \W works since we handle empty strings anyway. -2 bytes by me for realizing I could ascii_downcase first, saving a . and a |

ascii_downcase+":"|[splits("\\W")]|unique|length-1

Try it online! Try it online! Try it online!

Equivalent 50 byte answer:

ascii_downcase+":"|split("\\W";"")|unique|length-1

Thanks to chunes's answer for inspiring me to try golfing splits("\\W+") instead of match("\\w+";"g"). Turns out, despite having to work around the empty string being matched in some cases, it is two bytes shorter!

For those curious, here's the match method:

[match("\\w+";"g").string|ascii_downcase]|unique|length

Mathematica (Wolfram Language), 31 bytes

Length@*WordCounts@*ToLowerCase

The Mathematica built-in WordCounts treats a hyphenated word as a single word. So this program is not going to work correctly on the fourth example, "hello-world2". I discuss this in the comment below.

Factor, 47 46 bytes

[ >lower R/ \W/ re-split harvest cardinality ]

Try it online!

-1 byte thanks to GammaFunction

Splitting and harvesting is shorter than simply getting a list of matches because the word for that (all-matching-slices) is super long. Not sure if there is a way to prevent the empty strings in pure regex, might be shorter.

Vyxal, 7 bytes

ɽøWǍUL‹

Try it Online!

Explanation:

ɽ       - Lowercases Input
 øW     - Groups string by words into a list
   Ǎ    - Removes all non-alphabetical items, leaving empty list spaces
    U   - Uniquifies the list (removes duplicate items)
     L  - Gets length of list
      ‹ - Decrements by 1, to account for extra list item for first symbol

PHP 8.x, 87 bytes

This is a quite long piece of code...

fn($z)=>count(array_flip(array_filter(array_map('strtolower',preg_split('/\W+/',$z)))))

It's almost self-explanatory.

It's so devoid of PHP-only tricks that it is trivial to re-implement it in JavaScript!

// PHP-like code
let fn = ($z)=>count(array_flip(array_filter(array_map('strtolower',preg_split('\\W+',$z)))));

// Test boilerplate that has enough functionality
// Stand-in for PHP's count(): reports how many elements the array holds.
function count(value) {
  const { length } = value;
  return length;
}

// Stand-in for PHP's strtolower(): lowercases the given string.
function strtolower(str) {
  const lowered = str.toLowerCase();
  return lowered;
}

// Stand-in for PHP's preg_split(): `regex` is a pattern source string
// (no delimiters) and `subject` is split on every match of it.
function preg_split(regex, subject) {
  const pattern = new RegExp(regex);
  return subject.split(pattern);
}

// Stand-in for PHP's array_filter(): keeps every element except the
// empty string.
function array_filter(array) {
  const isNotEmptyString = (value) => value !== '';
  return array.filter(isNotEmptyString);
}

// Stand-in for PHP's array_flip(): here it only needs to collapse duplicate
// values, so the array is de-duplicated through a Set (first occurrence wins).
function array_flip(array) {
  const distinct = new Set(array);
  return [...distinct];
}

// Stand-in for PHP's array_map(): `fn` is either a function or the name of a
// global function, which is then looked up on `window`.
function array_map(fn, array) {
  const callback = typeof fn === 'string' ? window[fn] : fn;
  return array.map(callback);
}

// Event handler - does the basic output thing
text.oninput = function(){
  output.innerText = fn(this.value);
};
<input type="text" id="text"/>

<p>Output: <span id="output">--</span></p>


Differences

There aren't a lot of differences between the PHP and JavaScript versions:

These differences won't affect the accuracy of the results.
But it's worth dealing with them to give you an improved testing environment.

Japt, 14 bytes

Yet another one of those occasions I regret suggesting the removal of _ from the \w RegEx class in Japt!

f"[%w_]+" üv l

Try it

f"[%w_]+" üv l     :Implicit input of string
f                  :Match
 "[%w_]+"          :  RegEx /[a-z0-9_]/gi
          ü        :Group & sort by
           v       :  Lowercase
             l     :Length

Perl -p, 24 bytes

$_=grep!$s{+lc}++,/\w+/g

Attempt This Online!

-1 byte thanks to Kjetil S

Raku, 21 bytes

+*.lc.comb(/\w+/).Set

Try it online!

This is an anonymous function. The argument (*) is converted to lowercase (.lc), then the substrings matching one or more word characters are extracted (.comb(/\w+/)) and converted to a set (.Set), which discards the duplicates. Finally, that set is expressed as a number (+), yielding its size.

Java (JDK), 83 bytes

s->java.util.Arrays.stream((" "+s).toLowerCase().split("\\W")).distinct().count()-1

Try it online!

Saved 17 bytes thanks to Neil.

Ruby -n, 32 bytes

p$_.upcase.scan(/\w+/).uniq.size

Attempt This Online!

Charcoal, 31 bytes

≔⦃⦄θFΦ⪪↧S⁻⪪γ¹⊞OE³⁶⍘ιφ_ι§≔θιιILθ

Attempt This Online! Link is to verbose version of code. Explanation: Inspired by @KevinCruijssen's 05AB1E answer, so assumes the input only contains printable ASCII.

≔⦃⦄θ

Start with an empty dictionary.

FΦ⪪↧S⁻⪪γ¹⊞OE³⁶⍘ιφ_ι

Split the lowercased input on all printable ASCII except the characters used to encode base 36 and _, and...

§≔θιι

... set each word as a key in the dictionary.

ILθ

Output the final length of the dictionary.

Retina 0.8.2, 15 bytes

T`L`l
D`\w+
\w+

Try it online! Link includes test cases. Explanation:

T`L`l

Convert to lower case.

D`\w+

Deduplicate words.

\w+

Count the number of remaining words.

13 bytes in Retina 1:

D$`\w+
$l
\w+

Try it online! Link includes test cases. Explanation:

D$`\w+
$l

Deduplicate words by lowercased value.

\w+

Count the number of remaining words.

05AB1E, 12 bytes

žQžjмS¡õKlÙg

Assumes the input will only contain printable ASCII characters.

Try it online or verify all test cases.

Explanation:

žQ            # Push the constant string with all printable ASCII characters
  žj          # Push the constant string with "a-zA-Z0-9_"
    м         # Remove all those characters
     S        # Convert the string to a list of characters
      ¡       # Split the (implicit) input-string by each of those characters
       õK     # Remove any empty strings from the list
         l    # Convert each word to lowercase
          Ù   # Uniquify the list of lowercase words
           g  # Pop and push the length
              # (which is output implicitly as result)

APL (Dyalog Extended), 27 bytes

f←{⍴∪(⎕A,⎕D,'_')(∊⍨⊆⊢)1⎕C⍵}

Try it online!

f←{⍴∪(⎕A,⎕D,'_')(∊⍨⊆⊢)1⎕C⍵}
                        1⎕C      to uppercase
     (⎕A,⎕D,'_')                 [A-Z0-9_]
     (⎕A,⎕D,'_')(∊⍨⊆⊢)1⎕C       Partition (⊆) using Membership (∊)
    ∪                             remove duplicates
   ⍴                              count words

Excel, 120 bytes

=LET(
    x,MID(UPPER(A1),ROW(A:A),1),
    ROWS(
        UNIQUE(
            TEXTSPLIT(A1,,
                IF((ABS(77.5-CODE(x&"Z"))<13)+1-ISERR(0+x)+(x="_")=0,x),1
            )
        )
    )
)

Python, 54 bytes

lambda s:len({*re.findall('\w+',s.lower())})
import re

Attempt This Online!

Vyxal, 7 bytes

`†`ẎɽUL

Try it Online!

`†`Ẏ    # find all matches of \w+
    ɽ   # to lowercase
     U  # uniquify
      L # length

With a flag:

Vyxal l, 6 bytes

`†`ẎɽU

Try it Online!

J, 23 bytes

'\w+'#@~.@rxall tolower

Uniquify and count word matches in lowercase input.

Attempt This Online!

Arturo, 35 bytes

$=>[match lower&{/\w+}|unique|size]

Try it

Thunno, \$ 11 \log_{256}(96) \approx \$ 9.05 bytes

u"\w+"AfZUL

Attempt This Online!

Same approach as basically every other answer.

u"\w+"AfZUL  # Implicit input
u            # Uppercase
 "\w+"Af     # Regex findall "\w+"
        ZU   # Uniquify
          L  # Length

PowerShell Core, 49 bytes

($args-split'\W'-ne''|%{$_|% *l*r}|sort|gu).Count

Try it online!

($args-split'\W'-ne''|%{$_|% *l*r}|sort|gu).Count # full function
 $args-split'\W'-ne''                             # Splits the input string on non-word characters and removes empty entries
                     |%{$_|% *l*r}                # Calls ToLower() on each of the words
                                  |sort|gu        # Get unique words, Get-Unique needs the list to be sorted to remove all duplicates
(                                         ).Count # Return the count

Zsh, 36 bytes

<<<${#${(u)=${1:l}//[^[:IDENT:]]/ }}

Try it online!

Fortunately, the [:IDENT:] character class is exactly the words we should keep.

<<<${#${(u)=${1:l}//[^[:IDENT:]]/ }}
            ${1:l}                    # lowercase string
      ${          //[^[:IDENT:]]/ }   # // replace non-[:IDENT:] with spaces
      ${   =                      }   # = split on $IFS (space/tab/newline)
      ${(u)                       }   # keep first occurrence of each word
   ${#                             }  # count
<<<                                   # print

JavaScript, 46 bytes

s=>new Set(s.toLowerCase().match(/\w+/g)).size

Try it online!

Bash + GNU utils, 28

grep -Eo \\w+|sort -fu|wc -l

Try it online!