// RandomChad/community/scraping.js
// Simple sleep helper: resolve after duration_in_ms milliseconds ( default 1s )
const wait = ( duration_in_ms=1000 ) => new Promise( resolve => setTimeout( resolve, duration_in_ms ) )
/* ///////////////////////////////
// Twitter
// scraping for signer.is
// /////////////////////////////*/
// Extract the claimed wallet address from text containing a signer.is verification
// link. The link embeds a URI-encoded, base64 JSON payload after /#/verify/.
function get_address_from_base64( text ) {
    // Capture the payload between the verify URL and the next delimiter ( </ , "> , or end of input )
    const [ , base64 ] = text.match( /(?:https:\/\/signer\.is\/#\/verify\/)(.*?)(?:(<\/)|(">)|($))/ ) || []
    try {
        const decoded = atob( base64 )
        const json = JSON.parse( decodeURIComponent( decoded ) )
        return json.claimed_signatory
    } catch( e ) {
        console.log( `Decoding error for ${ base64 } `, e )
        return false
    }
}
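// A sanity-check sketch, with a hypothetical payload shaped after the decoder above:
// building a link via the inverse transform should round-trip. The address below is a
// made-up placeholder, not a real signer.is payload.
// const example_payload = btoa( encodeURIComponent( JSON.stringify( { claimed_signatory: '0x0000000000000000000000000000000000000000' } ) ) )
// get_address_from_base64( `https://signer.is/#/verify/${ example_payload }` ) // => the placeholder address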
// Resolve a batch of Twitter redirect links, then extract the signer.is address from each response body
async function get_addresses_from_twitter_links( links ) {
    const resolved_twitter_redirects = await Promise.all( links.map( url => fetch( url ).then( res => res.text() ) ) )
    const addresses = resolved_twitter_redirects.map( get_address_from_base64 )
    return addresses
}
// Same as above, but for a single link
async function get_address_from_twitter_link( link ) {
    const resolved_twitter_redirect = await fetch( link ).then( res => res.text() )
    const address = get_address_from_base64( resolved_twitter_redirect )
    return address
}
// Collect all signer.is verification links currently rendered on a tweet's reply page
async function scrape_signer_links_in_replies( ) {
    const hrefs = document.querySelectorAll( 'a' )
    const has_signer_is = [ ...hrefs ].filter( ( { innerText } ) => innerText.includes( 'signer.is/#/verify' ) )
    const signer_is_hrefs = has_signer_is.map( ( { href } ) => href )
    const addresses = await get_addresses_from_twitter_links( signer_is_hrefs )
    return addresses
}
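// A console usage sketch ( assumes this file was pasted into the devtools console on
// the tweet's reply page, which is what the document queries above expect ):
// scrape_signer_links_in_replies().then( addresses => console.log( addresses.join( '\n' ) ) )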
// Walk the Twitter DM inbox, open each conversation, and harvest signer.is links.
// Runs indefinitely: Twitter only renders conversations that are in view.
async function scrape_signer_links_in_dm( ) {
    const scroll_interval = 50
    console.log( `This function runs indefinitely. Keep an eye on it and stop it manually once the logged results stagnate.` )
    // Pull the @handle out of a conversation list element
    function get_handle_from_element( element ) {
        const [ , handle ] = element.innerHTML.match( /(@.+?)(?:<\/)/ ) || []
        if( handle ) return handle
        else return false
    }
    const hits = []
    const done = []
    while( true ) {
        const messages = document.querySelectorAll( '[aria-selected=false]' )
        for( let i = messages.length - 1; i >= 0; i-- ) {
            // Get the handle of the message we are trying
            const handle = get_handle_from_element( messages[i] )
            if( done.includes( handle ) ) continue
            if( !messages[i].isConnected ) continue
            // Open the message panel and grab the link
            messages[i].click()
            await wait()
            const links = document.querySelectorAll( 'a' )
            const { href } = [ ...links ].find( ( { innerText } ) => innerText.includes( 'signer.is/#/verify' ) ) || {}
            // Save the link and mark the handle as done if need be
            if( href ) {
                const address = await get_address_from_twitter_link( href ).catch( e => false )
                if( address ) hits.push( address )
            }
            done.push( handle )
            document.querySelector( '[aria-label="Back"]' ).click()
            await wait()
            window.scrollBy( 0, -scroll_interval )
        }
        console.log( `Checked ${ done.length } handles. Found: `, hits.join( '\n' ) )
        await wait()
    }
}
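// Presumably run from the devtools console on the Twitter DM inbox, where the
// '[aria-selected=false]' conversation rows and '[aria-label="Back"]' button exist:
// scrape_signer_links_in_dm( )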
// Collect signer.is verification links rendered in a Discord channel. The destination
// URL is read from each anchor's title attribute rather than its href.
function discord_channel_scraping() {
    const hrefs = document.querySelectorAll( 'a' )
    const has_signer_is = [ ...hrefs ].filter( ( { innerText } ) => innerText.includes( 'signer.is/#/verify' ) )
    // Debug: first matching link ( optional chaining avoids a crash when there are no hits )
    console.log( has_signer_is[0]?.href )
    const signer_is_hrefs = has_signer_is.map( ( { title } ) => title )
    const addresses = signer_is_hrefs.map( get_address_from_base64 )
    console.log( addresses.join( '\n' ) )
}
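// Likewise a console sketch: with the Discord web client open on the target channel,
// this logs one claimed address per line for every rendered signer.is link.
// discord_channel_scraping( )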
/* ///////////////////////////////
// Function handlers
// /////////////////////////////*/
// Repeatedly scrape the replies in view, deduplicate, and scroll down for more
async function get_all_addresses_from_replies() {
    let all = []
    const scroll_interval = 300
    console.log( 'This function runs indefinitely because Twitter only renders tweets that are in view. Stop it manually.' )
    while( true ) {
        const addresses = await scrape_signer_links_in_replies()
        let new_all = [ ...all, ...addresses ]
        new_all = [ ...new Set( new_all ) ]
        if( all.length != new_all.length ) console.log( new_all.join( '\n' ) )
        all = new_all
        window.scrollBy( 0, scroll_interval )
        await wait( 1000 )
    }
    // Unreachable while the loop above runs forever; kept for when a break condition is added
    console.log( `${ all.length } addresses: \n`, all.join( '\n' ) )
}
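// Toggle the entrypoint that matches the page you are scraping: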
// get_all_addresses_from_replies( )
scrape_signer_links_in_dm( )